From 1979ebb07ed58398ad2a5038052634241b02fb95 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 31 Aug 2021 10:48:53 +0800 Subject: [PATCH 1/2] add vst neon instructions --- .../core_arch/src/aarch64/neon/generated.rs | 132 + .../src/arm_shared/neon/generated.rs | 16266 +++++++++------- crates/stdarch-gen/neon.spec | 90 +- crates/stdarch-gen/src/main.rs | 709 +- 4 files changed, 9688 insertions(+), 7509 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 97e794f2a1..dd6494d5f6 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4592,6 +4592,84 @@ pub unsafe fn vld1q_f64_x4(a: *const f64) -> float64x2x4_t { vld1q_f64_x4_(a) } +/// Store multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_f64_x2(a: *mut f64, b: float64x1x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v1f64.p0f64")] + fn vst1_f64_x2_(a: float64x1_t, b: float64x1_t, ptr: *mut f64); + } + vst1_f64_x2_(b.0, b.1, a) +} + +/// Store multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_f64_x2(a: *mut f64, b: float64x2x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2f64.p0f64")] + fn vst1q_f64_x2_(a: float64x2_t, b: float64x2_t, ptr: *mut f64); + } + vst1q_f64_x2_(b.0, b.1, a) +} + +/// Store multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_f64_x3(a: *mut f64, b: float64x1x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v1f64.p0f64")] + fn vst1_f64_x3_(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut f64); + } + vst1_f64_x3_(b.0, b.1, b.2, a) +} + +/// Store multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_f64_x3(a: *mut f64, b: float64x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2f64.p0f64")] + fn vst1q_f64_x3_(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut f64); + } + vst1q_f64_x3_(b.0, b.1, b.2, a) +} + +/// Store multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1_f64_x4(a: *mut f64, b: float64x1x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v1f64.p0f64")] + fn vst1_f64_x4_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, ptr: *mut f64); + } + vst1_f64_x4_(b.0, b.1, b.2, b.3, a) +} + +/// Store multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st1))] +pub unsafe fn vst1q_f64_x4(a: *mut f64, b: float64x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = 
"aarch64", link_name = "llvm.aarch64.neon.st1x4.v2f64.p0f64")] + fn vst1q_f64_x4_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, ptr: *mut f64); + } + vst1q_f64_x4_(b.0, b.1, b.2, b.3, a) +} + /// Multiply #[inline] #[target_feature(enable = "neon")] @@ -12983,6 +13061,60 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vst1_f64_x2() { + let a: [f64; 3] = [0., 1., 2.]; + let e: [f64; 2] = [1., 2.]; + let mut r: [f64; 2] = [0f64; 2]; + vst1_f64_x2(r.as_mut_ptr(), vld1_f64_x2(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_f64_x2() { + let a: [f64; 5] = [0., 1., 2., 3., 4.]; + let e: [f64; 4] = [1., 2., 3., 4.]; + let mut r: [f64; 4] = [0f64; 4]; + vst1q_f64_x2(r.as_mut_ptr(), vld1q_f64_x2(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_f64_x3() { + let a: [f64; 4] = [0., 1., 2., 3.]; + let e: [f64; 3] = [1., 2., 3.]; + let mut r: [f64; 3] = [0f64; 3]; + vst1_f64_x3(r.as_mut_ptr(), vld1_f64_x3(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_f64_x3() { + let a: [f64; 7] = [0., 1., 2., 3., 4., 5., 6.]; + let e: [f64; 6] = [1., 2., 3., 4., 5., 6.]; + let mut r: [f64; 6] = [0f64; 6]; + vst1q_f64_x3(r.as_mut_ptr(), vld1q_f64_x3(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_f64_x4() { + let a: [f64; 5] = [0., 1., 2., 3., 4.]; + let e: [f64; 4] = [1., 2., 3., 4.]; + let mut r: [f64; 4] = [0f64; 4]; + vst1_f64_x4(r.as_mut_ptr(), vld1_f64_x4(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_f64_x4() { + let a: [f64; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.]; + let e: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.]; + let mut r: [f64; 8] = [0f64; 8]; + vst1q_f64_x4(r.as_mut_ptr(), vld1q_f64_x4(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_f64() { let a: f64 = 1.0; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index bdf8937d9d..e8b76ae377 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -6698,5218 +6698,5488 @@ pub unsafe fn vld1q_f32_x4(a: *const f32) -> float32x4x4_t { vld1q_f32_x4_(a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - simd_mul(a, b) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s8_x2(a: *mut i8, b: int8x8x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i8.v8i8")] + fn vst1_s8_x2_(ptr: *mut i8, a: int8x8_t, b: int8x8_t); + } +vst1_s8_x2_(a, b.0, b.1) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr("vmul.i8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - simd_mul(a, b) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s8_x2(a: *mut i8, b: int8x8x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v8i8.p0i8")] + fn vst1_s8_x2_(a: int8x8_t, b: int8x8_t, ptr: *mut i8); + } +vst1_s8_x2_(b.0, b.1, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - simd_mul(a, b) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s16_x2(a: *mut i16, b: int16x4x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i16.v4i16")] + fn vst1_s16_x2_(ptr: *mut i16, a: int16x4_t, b: int16x4_t); + } +vst1_s16_x2_(a, b.0, b.1) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - simd_mul(a, b) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s16_x2(a: *mut i16, b: int16x4x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v4i16.p0i16")] + fn vst1_s16_x2_(a: int16x4_t, b: int16x4_t, ptr: *mut i16); + } +vst1_s16_x2_(b.0, b.1, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - simd_mul(a, b) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s32_x2(a: *mut i32, b: int32x2x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i32.v2i32")] + fn vst1_s32_x2_(ptr: *mut i32, a: int32x2_t, b: int32x2_t); + } +vst1_s32_x2_(a, b.0, b.1) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - simd_mul(a, b) 
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s32_x2(a: *mut i32, b: int32x2x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2i32.p0i32")] + fn vst1_s32_x2_(a: int32x2_t, b: int32x2_t, ptr: *mut i32); + } +vst1_s32_x2_(b.0, b.1, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { - simd_mul(a, b) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s64_x2(a: *mut i64, b: int64x1x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i64.v1i64")] + fn vst1_s64_x2_(ptr: *mut i64, a: int64x1_t, b: int64x1_t); + } +vst1_s64_x2_(a, b.0, b.1) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - simd_mul(a, b) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s64_x2(a: *mut i64, b: int64x1x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v1i64.p0i64")] + fn vst1_s64_x2_(a: int64x1_t, b: int64x1_t, ptr: *mut i64); + } +vst1_s64_x2_(b.0, b.1, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { - simd_mul(a, b) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s8_x2(a: *mut i8, b: int8x16x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i8.v16i8")] + fn vst1q_s8_x2_(ptr: *mut i8, a: int8x16_t, b: int8x16_t); + } +vst1q_s8_x2_(a, b.0, b.1) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { - simd_mul(a, b) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s8_x2(a: *mut i8, b: int8x16x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + 
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v16i8.p0i8")] + fn vst1q_s8_x2_(a: int8x16_t, b: int8x16_t, ptr: *mut i8); + } +vst1q_s8_x2_(b.0, b.1, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { - simd_mul(a, b) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s16_x2(a: *mut i16, b: int16x8x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i16.v8i16")] + fn vst1q_s16_x2_(ptr: *mut i16, a: int16x8_t, b: int16x8_t); + } +vst1q_s16_x2_(a, b.0, b.1) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { - simd_mul(a, b) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s16_x2(a: *mut i16, b: int16x8x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v8i16.p0i16")] + fn vst1q_s16_x2_(a: int16x8_t, b: int16x8_t, ptr: *mut i16); + } +vst1q_s16_x2_(b.0, b.1, a) } -/// Polynomial multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))] -pub unsafe fn vmul_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s32_x2(a: *mut i32, b: int32x4x2_t) { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v8i8")] - fn vmul_p8_(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i32.v4i32")] + fn vst1q_s32_x2_(ptr: *mut i32, a: int32x4_t, b: int32x4_t); } -vmul_p8_(a, b) +vst1q_s32_x2_(a, b.0, b.1) } -/// Polynomial multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))] -pub unsafe fn vmulq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s32_x2(a: *mut i32, b: int32x4x2_t) { 
#[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v16i8")] - fn vmulq_p8_(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v4i32.p0i32")] + fn vst1q_s32_x2_(a: int32x4_t, b: int32x4_t, ptr: *mut i32); } -vmulq_p8_(a, b) +vst1q_s32_x2_(b.0, b.1, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))] -pub unsafe fn vmul_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { - simd_mul(a, b) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s64_x2(a: *mut i64, b: int64x2x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i64.v2i64")] + fn vst1q_s64_x2_(ptr: *mut i64, a: int64x2_t, b: int64x2_t); + } +vst1q_s64_x2_(a, b.0, b.1) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))] -pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { - simd_mul(a, b) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s64_x2(a: *mut i64, b: int64x2x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2i64.p0i64")] + fn vst1q_s64_x2_(a: int64x2_t, b: int64x2_t, ptr: *mut i64); + } +vst1q_s64_x2_(b.0, b.1, a) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_n_s16(a: int16x4_t, b: i16) -> int16x4_t { - simd_mul(a, vdup_n_s16(b)) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s8_x3(a: *mut i8, b: int8x8x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i8.v8i8")] + fn vst1_s8_x3_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t); + } +vst1_s8_x3_(a, b.0, b.1, b.2) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_n_s16(a: int16x8_t, b: i16) -> int16x8_t { - simd_mul(a, vdupq_n_s16(b)) 
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s8_x3(a: *mut i8, b: int8x8x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v8i8.p0i8")] + fn vst1_s8_x3_(a: int8x8_t, b: int8x8_t, c: int8x8_t, ptr: *mut i8); + } +vst1_s8_x3_(b.0, b.1, b.2, a) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_n_s32(a: int32x2_t, b: i32) -> int32x2_t { - simd_mul(a, vdup_n_s32(b)) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s16_x3(a: *mut i16, b: int16x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i16.v4i16")] + fn vst1_s16_x3_(ptr: *mut i16, a: int16x4_t, b: int16x4_t, c: int16x4_t); + } +vst1_s16_x3_(a, b.0, b.1, b.2) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_n_s32(a: int32x4_t, b: i32) -> int32x4_t { - simd_mul(a, vdupq_n_s32(b)) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s16_x3(a: *mut i16, b: int16x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v4i16.p0i16")] + fn vst1_s16_x3_(a: int16x4_t, b: int16x4_t, c: int16x4_t, ptr: *mut i16); + } +vst1_s16_x3_(b.0, b.1, b.2, a) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_n_u16(a: uint16x4_t, b: u16) -> uint16x4_t { - simd_mul(a, vdup_n_u16(b)) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s32_x3(a: *mut i32, b: int32x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i32.v2i32")] + fn vst1_s32_x3_(ptr: *mut i32, a: int32x2_t, b: int32x2_t, c: int32x2_t); + } +vst1_s32_x3_(a, b.0, b.1, b.2) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_n_u16(a: uint16x8_t, b: u16) -> uint16x8_t { - simd_mul(a, vdupq_n_u16(b)) +#[cfg_attr(all(test, target_arch = 
"aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s32_x3(a: *mut i32, b: int32x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2i32.p0i32")] + fn vst1_s32_x3_(a: int32x2_t, b: int32x2_t, c: int32x2_t, ptr: *mut i32); + } +vst1_s32_x3_(b.0, b.1, b.2, a) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmul_n_u32(a: uint32x2_t, b: u32) -> uint32x2_t { - simd_mul(a, vdup_n_u32(b)) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s64_x3(a: *mut i64, b: int64x1x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i64.v1i64")] + fn vst1_s64_x3_(ptr: *mut i64, a: int64x1_t, b: int64x1_t, c: int64x1_t); + } +vst1_s64_x3_(a, b.0, b.1, b.2) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] -pub unsafe fn vmulq_n_u32(a: uint32x4_t, b: u32) -> uint32x4_t { - simd_mul(a, vdupq_n_u32(b)) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s64_x3(a: *mut i64, b: int64x1x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v1i64.p0i64")] + fn vst1_s64_x3_(a: int64x1_t, b: int64x1_t, c: int64x1_t, ptr: *mut i64); + } +vst1_s64_x3_(b.0, b.1, b.2, a) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))] -pub unsafe fn vmul_n_f32(a: float32x2_t, b: f32) -> float32x2_t { - simd_mul(a, vdup_n_f32(b)) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s8_x3(a: *mut i8, b: int8x16x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i8.v16i8")] + fn vst1q_s8_x3_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t); + } +vst1q_s8_x3_(a, b.0, b.1, b.2) } -/// Vector multiply by scalar +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))] -pub unsafe fn vmulq_n_f32(a: float32x4_t, b: f32) -> float32x4_t { - simd_mul(a, vdupq_n_f32(b)) +#[cfg_attr(all(test, target_arch = "aarch64"), 
assert_instr(st1))] +pub unsafe fn vst1q_s8_x3(a: *mut i8, b: int8x16x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v16i8.p0i8")] + fn vst1q_s8_x3_(a: int8x16_t, b: int8x16_t, c: int8x16_t, ptr: *mut i8); + } +vst1q_s8_x3_(b.0, b.1, b.2, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s16_x3(a: *mut i16, b: int16x8x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i16.v8i16")] + fn vst1q_s16_x3_(ptr: *mut i16, a: int16x8_t, b: int16x8_t, c: int16x8_t); + } +vst1q_s16_x3_(a, b.0, b.1, b.2) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { - static_assert_imm3!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s16_x3(a: *mut i16, b: int16x8x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v8i16.p0i16")] + fn vst1q_s16_x3_(a: int16x8_t, b: int16x8_t, c: int16x8_t, ptr: *mut i16); + } +vst1q_s16_x3_(b.0, b.1, b.2, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s32_x3(a: *mut i32, b: int32x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i32.v4i32")] + fn vst1q_s32_x3_(ptr: *mut i32, a: int32x4_t, b: int32x4_t, c: int32x4_t); + } +vst1q_s32_x3_(a, b.0, b.1, b.2) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four 
registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - static_assert_imm3!(LANE); - simd_mul(a, simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s32_x3(a: *mut i32, b: int32x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v4i32.p0i32")] + fn vst1q_s32_x3_(a: int32x4_t, b: int32x4_t, c: int32x4_t, ptr: *mut i32); + } +vst1q_s32_x3_(b.0, b.1, b.2, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - static_assert_imm1!(LANE); - simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32])) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s64_x3(a: *mut i64, b: int64x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i64.v2i64")] + fn vst1q_s64_x3_(ptr: *mut i64, a: int64x2_t, b: int64x2_t, c: int64x2_t); + } +vst1q_s64_x3_(a, b.0, b.1, b.2) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s64_x3(a: *mut i64, b: int64x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2i64.p0i64")] + fn vst1q_s64_x3_(a: int64x2_t, b: int64x2_t, c: int64x2_t, ptr: *mut i64); + } +vst1q_s64_x3_(b.0, b.1, b.2, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { - static_assert_imm1!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) 
+#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s8_x4(a: *mut i8, b: int8x8x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i8.v8i8")] + fn vst1_s8_x4_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t); + } +vst1_s8_x4_(a, b.0, b.1, b.2, b.3) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s8_x4(a: *mut i8, b: int8x8x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v8i8.p0i8")] + fn vst1_s8_x4_(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, ptr: *mut i8); + } +vst1_s8_x4_(b.0, b.1, b.2, b.3, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s16_x4(a: *mut i16, b: int16x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i16.v4i16")] + fn vst1_s16_x4_(ptr: *mut i16, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t); + } +vst1_s16_x4_(a, b.0, b.1, b.2, b.3) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t { - static_assert_imm3!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s16_x4(a: *mut i16, b: int16x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v4i16.p0i16")] + fn vst1_s16_x4_(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, ptr: *mut i16); + } +vst1_s16_x4_(b.0, b.1, b.2, b.3, a) } -/// Multiply 
+/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_lane_u16(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s32_x4(a: *mut i32, b: int32x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i32.v2i32")] + fn vst1_s32_x4_(ptr: *mut i32, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t); + } +vst1_s32_x4_(a, b.0, b.1, b.2, b.3) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_laneq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { - static_assert_imm3!(LANE); - simd_mul(a, simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s32_x4(a: *mut i32, b: int32x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v2i32.p0i32")] + fn vst1_s32_x4_(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, ptr: *mut i32); + } +vst1_s32_x4_(b.0, b.1, b.2, b.3, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_lane_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { - static_assert_imm1!(LANE); - simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32])) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1_s64_x4(a: *mut i64, b: int64x1x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i64.v1i64")] + fn vst1_s64_x4_(ptr: *mut i64, a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t); + } +vst1_s64_x4_(a, b.0, b.1, b.2, b.3) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = 
"aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_laneq_u32(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_s64_x4(a: *mut i64, b: int64x1x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v1i64.p0i64")] + fn vst1_s64_x4_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, ptr: *mut i64); + } +vst1_s64_x4_(b.0, b.1, b.2, b.3, a) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_lane_u32(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t { - static_assert_imm1!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s8_x4(a: *mut i8, b: int8x16x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i8.v16i8")] + fn vst1q_s8_x4_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t); + } +vst1q_s8_x4_(a, b.0, b.1, b.2, b.3) } -/// Multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_laneq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s8_x4(a: *mut i8, b: int8x16x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v16i8.p0i8")] + fn vst1q_s8_x4_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, ptr: *mut i8); + } +vst1q_s8_x4_(b.0, b.1, b.2, b.3, a) } -/// Floating-point multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_lane_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { - static_assert_imm1!(LANE); - simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32])) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s16_x4(a: *mut i16, b: int16x8x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i16.v8i16")] + fn vst1q_s16_x4_(ptr: *mut i16, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t); + } +vst1q_s16_x4_(a, b.0, b.1, b.2, b.3) } -/// Floating-point multiply +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmul_laneq_f32(a: float32x2_t, b: float32x4_t) -> float32x2_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32])) -} - -/// Floating-point multiply -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_lane_f32(a: float32x4_t, b: float32x2_t) -> float32x4_t { - static_assert_imm1!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) -} - -/// Floating-point multiply -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmulq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { - static_assert_imm2!(LANE); - simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s16_x4(a: *mut i16, b: int16x8x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v8i16.p0i16")] + fn vst1q_s16_x4_(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, ptr: *mut i16); + } +vst1q_s16_x4_(b.0, b.1, b.2, b.3, a) } -/// Signed multiply long +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] -pub unsafe fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s32_x4(a: *mut i32, b: int32x4x4_t) { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v8i8")] - fn vmull_s8_(a: int8x8_t, b: int8x8_t) -> int16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i32.v4i32")] + fn vst1q_s32_x4_(ptr: *mut i32, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t); } -vmull_s8_(a, b) +vst1q_s32_x4_(a, b.0, b.1, b.2, b.3) } -/// Signed multiply long +/// Store multiple single-element structures from one, two, three, or four registers #[inline] 
+#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] -pub unsafe fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s32_x4(a: *mut i32, b: int32x4x4_t) { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v4i16")] - fn vmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t; + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v4i32.p0i32")] + fn vst1q_s32_x4_(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, ptr: *mut i32); } -vmull_s16_(a, b) +vst1q_s32_x4_(b.0, b.1, b.2, b.3, a) } -/// Signed multiply long +/// Store multiple single-element structures from one, two, three, or four registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] -pub unsafe fn vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +pub unsafe fn vst1q_s64_x4(a: *mut i64, b: int64x2x4_t) { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v2i32")] - fn vmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i64.v2i64")] + fn vst1q_s64_x4_(ptr: *mut i64, a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t); } -vmull_s32_(a, b) +vst1q_s64_x4_(a, b.0, b.1, b.2, b.3) } -/// Unsigned multiply long +/// Store multiple single-element structures from one, two, three, or four registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_s64_x4(a: *mut i64, b: int64x2x4_t) { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v8i8")] - fn vmull_u8_(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t; + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v2i64.p0i64")] + fn vst1q_s64_x4_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, ptr: *mut i64); } -vmull_u8_(a, b) +vst1q_s64_x4_(b.0, b.1, b.2, b.3, a) } -/// Unsigned multiply long +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), 
assert_instr(umull))] -pub unsafe fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v4i16")] - fn vmull_u16_(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t; - } -vmull_u16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_u8_x2(a: *mut u8, b: uint8x8x2_t) { + vst1_s8_x2(transmute(a), transmute(b)) } -/// Unsigned multiply long +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v2i32")] - fn vmull_u32_(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t; - } -vmull_u32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_u16_x2(a: *mut u16, b: uint16x4x2_t) { + vst1_s16_x2(transmute(a), transmute(b)) } -/// Polynomial multiply long +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.p8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmull))] -pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullp.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmull.v8i8")] - fn vmull_p8_(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t; - } -vmull_p8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_u32_x2(a: *mut u32, b: uint32x2x2_t) { + vst1_s32_x2(transmute(a), transmute(b)) } -/// Vector long multiply with scalar +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] -pub unsafe fn vmullh_n_s16(a: int16x4_t, b: i16) -> int32x4_t { - vmull_s16(a, vdup_n_s16(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_u64_x2(a: *mut u64, b: uint64x1x2_t) { + vst1_s64_x2(transmute(a), transmute(b)) } -/// Vector long multiply with scalar +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] 
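// The unsigned variants above need no LLVM hooks of their own: an ST1 store
// is bit-pattern preserving, so each one forwards to its signed counterpart
// through `transmute` (sound here because e.g. `uint8x8x2_t` and
// `int8x8x2_t` share the same layout). Usage mirrors the signed forms
// (hypothetical buffer, assuming a NEON-enabled target):
//
// unsafe {
//     let mut dst = [0u16; 16];
//     let pair = vld1q_u16_x2(dst.as_ptr()); // uint16x8x2_t
//     vst1q_u16_x2(dst.as_mut_ptr(), pair);
// }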
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] -pub unsafe fn vmulls_n_s32(a: int32x2_t, b: i32) -> int64x2_t { - vmull_s32(a, vdup_n_s32(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_u8_x2(a: *mut u8, b: uint8x16x2_t) { + vst1q_s8_x2(transmute(a), transmute(b)) } -/// Vector long multiply with scalar +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmullh_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t { - vmull_u16(a, vdup_n_u16(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_u16_x2(a: *mut u16, b: uint16x8x2_t) { + vst1q_s16_x2(transmute(a), transmute(b)) } -/// Vector long multiply with scalar +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmulls_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t { - vmull_u32(a, vdup_n_u32(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_u32_x2(a: *mut u32, b: uint32x4x2_t) { + vst1q_s32_x2(transmute(a), transmute(b)) } -/// Vector long multiply by scalar +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { - static_assert_imm2!(LANE); - vmull_s16(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1q_u64_x2(a: *mut u64, b: uint64x2x2_t) { + vst1q_s64_x2(transmute(a), transmute(b)) } -/// Vector long multiply by scalar +/// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t { - static_assert_imm3!(LANE); - vmull_s16(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))] +pub unsafe fn vst1_u8_x3(a: *mut u8, b: uint8x8x3_t) { + vst1_s8_x3(transmute(a), transmute(b)) } -/// Vector long multiply by scalar +/// Store 
-/// Vector long multiply by scalar
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vmull_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
-    static_assert_imm1!(LANE);
-    vmull_s32(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_u16_x3(a: *mut u16, b: uint16x4x3_t) {
+    vst1_s16_x3(transmute(a), transmute(b))
 }

-/// Vector long multiply by scalar
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vmull_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
-    static_assert_imm2!(LANE);
-    vmull_s32(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_u32_x3(a: *mut u32, b: uint32x2x3_t) {
+    vst1_s32_x3(transmute(a), transmute(b))
 }

-/// Vector long multiply by scalar
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vmull_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
-    static_assert_imm2!(LANE);
-    vmull_u16(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_u64_x3(a: *mut u64, b: uint64x1x3_t) {
+    vst1_s64_x3(transmute(a), transmute(b))
 }

-/// Vector long multiply by scalar
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vmull_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t {
-    static_assert_imm3!(LANE);
-    vmull_u16(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_u8_x3(a: *mut u8, b: uint8x16x3_t) {
+    vst1q_s8_x3(transmute(a), transmute(b))
 }

-/// Vector long multiply by scalar
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vmull_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
-    static_assert_imm1!(LANE);
-    vmull_u32(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_u16_x3(a: *mut u16, b: uint16x8x3_t) {
+    vst1q_s16_x3(transmute(a), transmute(b))
 }

-/// Vector long multiply by scalar
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vmull_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t {
-    static_assert_imm2!(LANE);
-    vmull_u32(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_u32_x3(a: *mut u32, b: uint32x4x3_t) {
+    vst1q_s32_x3(transmute(a), transmute(b))
 }

-/// Floating-point fused Multiply-Add to accumulator(vector)
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
-pub unsafe fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f32")]
-        fn vfma_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
-    }
-vfma_f32_(b, c, a)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_u64_x3(a: *mut u64, b: uint64x2x3_t) {
+    vst1q_s64_x3(transmute(a), transmute(b))
 }

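A note on the link_name strings bound by the shims in this file, reading only what the names themselves encode (the suffixes are the LLVM type mangling of the intrinsic's operands):

// llvm.aarch64.neon.st1x3.v2f32.p0f32
//                   ^     ^     ^
//                   |     |     +-- p0f32: pointer to f32 in address space 0
//                   |     +-- v2f32: each stored vector is a float32x2_t
//                   +-- st1x3: the three-register form of ST1
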
-/// Floating-point fused Multiply-Add to accumulator(vector)
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
-pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v4f32")]
-        fn vfmaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
-    }
-vfmaq_f32_(b, c, a)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_u8_x4(a: *mut u8, b: uint8x8x4_t) {
+    vst1_s8_x4(transmute(a), transmute(b))
 }

-/// Floating-point fused Multiply-Add to accumulator(vector)
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
-pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
-    vfma_f32(a, b, vdup_n_f32(c))
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_u16_x4(a: *mut u16, b: uint16x4x4_t) {
+    vst1_s16_x4(transmute(a), transmute(b))
 }

-/// Floating-point fused Multiply-Add to accumulator(vector)
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
-pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
-    vfmaq_f32(a, b, vdupq_n_f32(c))
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_u32_x4(a: *mut u32, b: uint32x2x4_t) {
+    vst1_s32_x4(transmute(a), transmute(b))
 }

-/// Floating-point fused multiply-subtract from accumulator
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
-pub unsafe fn vfms_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
-    let b: float32x2_t = simd_neg(b);
-    vfma_f32(a, b, c)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_u64_x4(a: *mut u64, b: uint64x1x4_t) {
+    vst1_s64_x4(transmute(a), transmute(b))
 }

-/// Floating-point fused multiply-subtract from accumulator
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
-pub unsafe fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
-    let b: float32x4_t = simd_neg(b);
-    vfmaq_f32(a, b, c)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_u8_x4(a: *mut u8, b: uint8x16x4_t) {
+    vst1q_s8_x4(transmute(a), transmute(b))
 }

-/// Floating-point fused Multiply-subtract to accumulator(vector)
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
-pub unsafe fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
-    vfms_f32(a, b, vdup_n_f32(c))
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_u16_x4(a: *mut u16, b: uint16x8x4_t) {
+    vst1q_s16_x4(transmute(a), transmute(b))
 }

-/// Floating-point fused Multiply-subtract to accumulator(vector)
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
-pub unsafe fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
-    vfmsq_f32(a, b, vdupq_n_f32(c))
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_u32_x4(a: *mut u32, b: uint32x4x4_t) {
+    vst1q_s32_x4(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_u64_x4(a: *mut u64, b: uint64x2x4_t) {
+    vst1q_s64_x4(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p8_x2(a: *mut p8, b: poly8x8x2_t) {
+    vst1_s8_x2(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p8_x3(a: *mut p8, b: poly8x8x3_t) {
+    vst1_s8_x3(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p8_x4(a: *mut p8, b: poly8x8x4_t) {
+    vst1_s8_x4(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p8_x2(a: *mut p8, b: poly8x16x2_t) {
+    vst1q_s8_x2(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p8_x3(a: *mut p8, b: poly8x16x3_t) {
+    vst1q_s8_x3(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p8_x4(a: *mut p8, b: poly8x16x4_t) {
+    vst1q_s8_x4(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p16_x2(a: *mut p16, b: poly16x4x2_t) {
+    vst1_s16_x2(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p16_x3(a: *mut p16, b: poly16x4x3_t) {
+    vst1_s16_x3(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p16_x4(a: *mut p16, b: poly16x4x4_t) {
+    vst1_s16_x4(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p16_x2(a: *mut p16, b: poly16x8x2_t) {
+    vst1q_s16_x2(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p16_x3(a: *mut p16, b: poly16x8x3_t) {
+    vst1q_s16_x3(transmute(a), transmute(b))
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p16_x4(a: *mut p16, b: poly16x8x4_t) {
+    vst1q_s16_x4(transmute(a), transmute(b))
 }

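The f32 shims that follow are where the two backends diverge structurally: the ARM LLVM intrinsic takes the destination pointer first, while the AArch64 one takes it last, so each target gets its own #[cfg]-gated definition. Schematically (signatures taken from the extern blocks below):

// ARM     (llvm.arm.neon.vst1x2.*):     fn vst1_f32_x2_(ptr: *mut f32, a: float32x2_t, b: float32x2_t);
// AArch64 (llvm.aarch64.neon.st1x2.*):  fn vst1_f32_x2_(a: float32x2_t, b: float32x2_t, ptr: *mut f32);
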
-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    simd_sub(a, b)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+pub unsafe fn vst1_f32_x2(a: *mut f32, b: float32x2x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0f32.v2f32")]
+        fn vst1_f32_x2_(ptr: *mut f32, a: float32x2_t, b: float32x2_t);
+    }
+vst1_f32_x2_(a, b.0, b.1)
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_f32_x2(a: *mut f32, b: float32x2x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2f32.p0f32")]
+        fn vst1_f32_x2_(a: float32x2_t, b: float32x2_t, ptr: *mut f32);
+    }
+vst1_f32_x2_(b.0, b.1, a)
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    simd_sub(a, b)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+pub unsafe fn vst1q_f32_x2(a: *mut f32, b: float32x4x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0f32.v4f32")]
+        fn vst1q_f32_x2_(ptr: *mut f32, a: float32x4_t, b: float32x4_t);
+    }
+vst1q_f32_x2_(a, b.0, b.1)
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
-pub unsafe fn vsub_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_f32_x2(a: *mut f32, b: float32x4x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v4f32.p0f32")]
+        fn vst1q_f32_x2_(a: float32x4_t, b: float32x4_t, ptr: *mut f32);
+    }
+vst1q_f32_x2_(b.0, b.1, a)
 }

-/// Subtract
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
-pub unsafe fn vsubq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_sub(a, b)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+pub unsafe fn vst1_f32_x3(a: *mut f32, b: float32x2x3_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0f32.v2f32")]
+        fn vst1_f32_x3_(ptr: *mut f32, a: float32x2_t, b: float32x2_t, c: float32x2_t);
+    }
+vst1_f32_x3_(a, b.0, b.1, b.2)
 }

-/// Subtract returning high narrow
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t {
-    let c: i16x8 = i16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_f32_x3(a: *mut f32, b: float32x2x3_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2f32.p0f32")]
+        fn vst1_f32_x3_(a: float32x2_t, b: float32x2_t, c: float32x2_t, ptr: *mut f32);
+    }
+vst1_f32_x3_(b.0, b.1, b.2, a)
 }

-/// Subtract returning high narrow
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t {
-    let c: i32x4 = i32x4::new(16, 16, 16, 16);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+pub unsafe fn vst1q_f32_x3(a: *mut f32, b: float32x4x3_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0f32.v4f32")]
+        fn vst1q_f32_x3_(ptr: *mut f32, a: float32x4_t, b: float32x4_t, c: float32x4_t);
+    }
+vst1q_f32_x3_(a, b.0, b.1, b.2)
 }

-/// Subtract returning high narrow
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t {
-    let c: i64x2 = i64x2::new(32, 32);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_f32_x3(a: *mut f32, b: float32x4x3_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v4f32.p0f32")]
+        fn vst1q_f32_x3_(a: float32x4_t, b: float32x4_t, c: float32x4_t, ptr: *mut f32);
+    }
+vst1q_f32_x3_(b.0, b.1, b.2, a)
 }

-/// Subtract returning high narrow
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
-    let c: u16x8 = u16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+pub unsafe fn vst1_f32_x4(a: *mut f32, b: float32x2x4_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0f32.v2f32")]
+        fn vst1_f32_x4_(ptr: *mut f32, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t);
+    }
+vst1_f32_x4_(a, b.0, b.1, b.2, b.3)
 }

-/// Subtract returning high narrow
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t {
-    let c: u32x4 = u32x4::new(16, 16, 16, 16);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_f32_x4(a: *mut f32, b: float32x2x4_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v2f32.p0f32")]
+        fn vst1_f32_x4_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, ptr: *mut f32);
+    }
+vst1_f32_x4_(b.0, b.1, b.2, b.3, a)
 }

-/// Subtract returning high narrow
+/// Store multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+pub unsafe fn vst1q_f32_x4(a: *mut f32, b: float32x4x4_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0f32.v4f32")]
+        fn vst1q_f32_x4_(ptr: *mut f32, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t);
+    }
+vst1q_f32_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
-    let c: u64x2 = u64x2::new(32, 32);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_f32_x4(a: *mut f32, b: float32x4x4_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v4f32.p0f32")]
+        fn vst1q_f32_x4_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, ptr: *mut f32);
+    }
+vst1q_f32_x4_(b.0, b.1, b.2, b.3, a)
 }

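A test-style sketch of the widest of these stores, modeled on this patch's own test conventions (the helper name and values are illustrative, and it assumes the matching vld1q_f32_x4 load intrinsic is available):

// Illustrative only: round-trip 16 f32 lanes through four q-registers.
unsafe fn demo_vst1q_f32_x4() {
    let a: [f32; 16] = [
        1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
    ];
    let mut r = [0f32; 16];
    vst1q_f32_x4(r.as_mut_ptr(), vld1q_f32_x4(a.as_ptr()));
    assert_eq!(r, a);
}
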
-/// Subtract returning high narrow
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
-    let d: int8x8_t = vsubhn_s16(b, c);
-    simd_shuffle16!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    simd_mul(a, b)
 }

-/// Subtract returning high narrow
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
-    let d: int16x4_t = vsubhn_s32(b, c);
-    simd_shuffle8!(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    simd_mul(a, b)
 }

-/// Subtract returning high narrow
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
-    let d: int32x2_t = vsubhn_s64(b, c);
-    simd_shuffle4!(a, d, [0, 1, 2, 3])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    simd_mul(a, b)
 }

-/// Subtract returning high narrow
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
-    let d: uint8x8_t = vsubhn_u16(b, c);
-    simd_shuffle16!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    simd_mul(a, b)
 }

-/// Subtract returning high narrow
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
-    let d: uint16x4_t = vsubhn_u32(b, c);
-    simd_shuffle8!(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    simd_mul(a, b)
 }

-/// Subtract returning high narrow
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
-    let d: uint32x2_t = vsubhn_u64(b, c);
-    simd_shuffle4!(a, d, [0, 1, 2, 3])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
-pub unsafe fn vhsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i8")]
-        fn vhsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
-    }
-vhsub_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
-pub unsafe fn vhsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v16i8")]
-        fn vhsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
-    }
-vhsubq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
-pub unsafe fn vhsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i16")]
-        fn vhsub_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
-    }
-vhsub_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
-pub unsafe fn vhsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i16")]
-        fn vhsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
-    }
-vhsubq_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
-pub unsafe fn vhsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v2i32")]
-        fn vhsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
-    }
-vhsub_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
-pub unsafe fn vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i32")]
-        fn vhsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
-    }
-vhsubq_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Polynomial multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
-pub unsafe fn vhsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))]
+pub unsafe fn vmul_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i8")]
-        fn vhsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v8i8")]
+        fn vmul_p8_(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
     }
-vhsub_s8_(a, b)
+vmul_p8_(a, b)
 }

-/// Signed halving subtract
+/// Polynomial multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
-pub unsafe fn vhsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))]
+pub unsafe fn vmulq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v16i8")]
-        fn vhsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v16i8")]
+        fn vmulq_p8_(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
     }
-vhsubq_s8_(a, b)
+vmulq_p8_(a, b)
 }

-/// Signed halving subtract
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
-pub unsafe fn vhsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i16")]
-        fn vhsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vhsub_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmul_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
-pub unsafe fn vhsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i16")]
-        fn vhsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vhsubq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    simd_mul(a, b)
 }

-/// Signed halving subtract
+/// Vector multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
-pub unsafe fn vhsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v2i32")]
-        fn vhsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vhsub_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_n_s16(a: int16x4_t, b: i16) -> int16x4_t {
+    simd_mul(a, vdup_n_s16(b))
 }

-/// Signed halving subtract
+/// Vector multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
-pub unsafe fn vhsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i32")]
-        fn vhsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vhsubq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_n_s16(a: int16x8_t, b: i16) -> int16x8_t {
+    simd_mul(a, vdupq_n_s16(b))
 }

-/// Signed Subtract Wide
+/// Vector multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))]
-pub unsafe fn vsubw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t {
-    simd_sub(a, simd_cast(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_n_s32(a: int32x2_t, b: i32) -> int32x2_t {
+    simd_mul(a, vdup_n_s32(b))
 }

-/// Signed Subtract Wide
+/// Vector multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))]
-pub unsafe fn vsubw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t {
-    simd_sub(a, simd_cast(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_n_s32(a: int32x4_t, b: i32) -> int32x4_t {
+    simd_mul(a, vdupq_n_s32(b))
 }

-/// Signed Subtract Wide
+/// Vector multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))]
-pub unsafe fn vsubw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t {
-    simd_sub(a, simd_cast(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_n_u16(a: uint16x4_t, b: u16) -> uint16x4_t {
+    simd_mul(a, vdup_n_u16(b))
 }

-/// Unsigned Subtract Wide
+/// Vector multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))]
-pub unsafe fn vsubw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t {
-    simd_sub(a, simd_cast(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_n_u16(a: uint16x8_t, b: u16) -> uint16x8_t {
+    simd_mul(a, vdupq_n_u16(b))
 }

uint16x4_t) -> uint32x4_t { - simd_sub(a, simd_cast(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] +pub unsafe fn vmul_n_u32(a: uint32x2_t, b: u32) -> uint32x2_t { + simd_mul(a, vdup_n_u32(b)) } -/// Unsigned Subtract Wide +/// Vector multiply by scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))] -pub unsafe fn vsubw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { - simd_sub(a, simd_cast(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] +pub unsafe fn vmulq_n_u32(a: uint32x4_t, b: u32) -> uint32x4_t { + simd_mul(a, vdupq_n_u32(b)) } -/// Signed Subtract Long +/// Vector multiply by scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] -pub unsafe fn vsubl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { - let c: int16x8_t = simd_cast(a); - let d: int16x8_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))] +pub unsafe fn vmul_n_f32(a: float32x2_t, b: f32) -> float32x2_t { + simd_mul(a, vdup_n_f32(b)) } -/// Signed Subtract Long +/// Vector multiply by scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] -pub unsafe fn vsubl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { - let c: int32x4_t = simd_cast(a); - let d: int32x4_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))] +pub unsafe fn vmulq_n_f32(a: float32x4_t, b: f32) -> float32x4_t { + simd_mul(a, vdupq_n_f32(b)) } -/// Signed Subtract Long +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] -pub unsafe fn vsubl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { - let c: int64x2_t = simd_cast(a); - let d: int64x2_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmul_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_imm2!(LANE); + simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Unsigned Subtract Long +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] -pub unsafe fn vsubl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { - let c: uint16x8_t = simd_cast(a); - let d: uint16x8_t = simd_cast(b); - 
simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmul_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { + static_assert_imm3!(LANE); + simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Unsigned Subtract Long +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] -pub unsafe fn vsubl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { - let c: uint32x4_t = simd_cast(a); - let d: uint32x4_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmulq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { + static_assert_imm2!(LANE); + simd_mul(a, simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Unsigned Subtract Long +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] -pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { - let c: uint64x2_t = simd_cast(a); - let d: uint64x2_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmulq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_imm3!(LANE); + simd_mul(a, simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Maximum (vector) +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i8")] - fn vmax_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; - } -vmax_s8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmul_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert_imm1!(LANE); + simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32])) } -/// Maximum (vector) +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v16i8")] - fn vmaxq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; - } -vmaxq_s8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmul_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { + static_assert_imm2!(LANE); + simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32])) } -/// Maximum (vector) +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i16")] - fn vmax_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; - } -vmax_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmulq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { + static_assert_imm1!(LANE); + simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Maximum (vector) +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i16")] - fn vmaxq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; - } -vmaxq_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmulq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert_imm2!(LANE); + simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Maximum (vector) +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v2i32")] - fn vmax_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; - } -vmax_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmul_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + 
-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))]
-pub unsafe fn vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v2i32")]
-        fn vmax_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vmax_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))]
-pub unsafe fn vmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i32")]
-        fn vmaxq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vmaxq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t {
+    static_assert_imm3!(LANE);
+    simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i8")]
-        fn vmax_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
-    }
-vmax_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v16i8")]
-        fn vmaxq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
-    }
-vmaxq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    static_assert_imm3!(LANE);
+    simd_mul(a, simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i16")]
-        fn vmax_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
-    }
-vmax_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i16")]
-        fn vmaxq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
-    }
-vmaxq_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v2i32")]
-        fn vmax_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
-    }
-vmax_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i32")]
-        fn vmaxq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
-    }
-vmaxq_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Floating-point multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))]
-pub unsafe fn vmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v2f32")]
-        fn vmax_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
-    }
-vmax_f32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
 }

-/// Maximum (vector)
+/// Floating-point multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))]
-pub unsafe fn vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v4f32")]
-        fn vmaxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
-    }
-vmaxq_f32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
 }

-/// Floating-point Maximum Number (vector)
+/// Floating-point multiply
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))]
-pub unsafe fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v2f32")]
-        fn vmaxnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
-    }
-vmaxnm_f32_(a, b)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

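Lane bounds are enforced at compile time throughout: `static_assert_immN!` limits `LANE` to N bits so it cannot exceed the source vector's lane count, and `#[rustc_legacy_const_generics(2)]` keeps the pre-const-generics three-argument call style working. A sketch of both sides, with assumed values:

    unsafe {
        let a = vdup_n_f32(2.0);          // [2.0, 2.0]
        let b = vdup_n_f32(3.0);          // [3.0, 3.0]
        let r = vmul_lane_f32::<1>(a, b); // [6.0, 6.0]
        // vmul_lane_f32::<2>(a, b) is rejected at compile time:
        // static_assert_imm1! admits only 0 or 1 for a two-lane vector.
    }
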
#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))] -pub unsafe fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v4f32")] - fn vmaxnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; - } -vmaxnmq_f32_(a, b) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmulq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + static_assert_imm2!(LANE); + simd_mul(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Minimum (vector) +/// Signed multiply long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] -pub unsafe fn vmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] +pub unsafe fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i8")] - fn vmin_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v8i8")] + fn vmull_s8_(a: int8x8_t, b: int8x8_t) -> int16x8_t; } -vmin_s8_(a, b) +vmull_s8_(a, b) } -/// Minimum (vector) +/// Signed multiply long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] -pub unsafe fn vminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] +pub unsafe fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v16i8")] - fn vminq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v4i16")] + fn vmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t; } -vminq_s8_(a, b) +vmull_s16_(a, b) } -/// Minimum (vector) +/// Signed multiply long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] -pub unsafe fn 
vmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] +pub unsafe fn vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i16")] - fn vmin_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v2i32")] + fn vmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t; } -vmin_s16_(a, b) +vmull_s32_(a, b) } -/// Minimum (vector) +/// Unsigned multiply long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] -pub unsafe fn vminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] +pub unsafe fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i16")] - fn vminq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v8i8")] + fn vmull_u8_(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t; } -vminq_s16_(a, b) +vmull_u8_(a, b) } -/// Minimum (vector) +/// Unsigned multiply long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] -pub unsafe fn vmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] +pub unsafe fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v2i32")] - fn vmin_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v4i16")] + fn vmull_u16_(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t; } -vmin_s32_(a, b) +vmull_u16_(a, b) } -/// Minimum (vector) +/// Unsigned multiply long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] -pub unsafe fn vminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] +pub unsafe fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { 
 #[allow(improper_ctypes)]
 extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i32")]
-        fn vminq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v2i32")]
+        fn vmull_u32_(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
     }
-vminq_s32_(a, b)
+vmull_u32_(a, b)
 }

-/// Minimum (vector)
+/// Polynomial multiply long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.p8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmull))]
+pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i8")]
-        fn vmin_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullp.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmull.v8i8")]
+        fn vmull_p8_(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t;
     }
-vmin_u8_(a, b)
+vmull_p8_(a, b)
 }

-/// Minimum (vector)
+/// Vector long multiply with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v16i8")]
-        fn vminq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
-    }
-vminq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+pub unsafe fn vmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t {
+    vmull_s16(a, vdup_n_s16(b))
 }

-/// Minimum (vector)
+/// Vector long multiply with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i16")]
-        fn vmin_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
-    }
-vmin_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+pub unsafe fn vmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t {
+    vmull_s32(a, vdup_n_s32(b))
 }

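The `vmull_n_*` wrappers splat the scalar with `vdup_n_*` and defer to the vector-by-vector widening multiply, so every product is computed at twice the element width and cannot wrap. A worked example with made-up numbers:

    unsafe {
        let a = vdup_n_s16(300);     // every lane holds 300i16
        let r = vmull_n_s16(a, 300); // int32x4_t, every lane 90_000
        // 300 * 300 = 90_000 overflows i16 (max 32_767) but fits comfortably in i32.
    }
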
target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] -pub unsafe fn vminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i16")] - fn vminq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; - } -vminq_u16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] +pub unsafe fn vmullh_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t { + vmull_u16(a, vdup_n_u16(b)) } -/// Minimum (vector) +/// Vector long multiply with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] -pub unsafe fn vmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v2i32")] - fn vmin_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; - } -vmin_u32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] +pub unsafe fn vmulls_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t { + vmull_u32(a, vdup_n_u32(b)) } -/// Minimum (vector) +/// Vector long multiply by scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] -pub unsafe fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i32")] - fn vminq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; - } -vminq_u32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + static_assert_imm2!(LANE); + vmull_s16(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Minimum (vector) +/// Vector long multiply by scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))] -pub unsafe fn vmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v2f32")] - fn vmin_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t; - } -vmin_f32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))] 
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(LANE);
+    vmull_s16(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Minimum (vector)
+/// Vector long multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))]
-pub unsafe fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v4f32")]
-        fn vminq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
-    }
-vminq_f32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(LANE);
+    vmull_s32(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
 }

-/// Floating-point Minimum Number (vector)
+/// Vector long multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))]
-pub unsafe fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v2f32")]
-        fn vminnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
-    }
-vminnm_f32_(a, b)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(LANE);
+    vmull_s32(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
 }

-/// Floating-point Minimum Number (vector)
+/// Vector long multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))]
-pub unsafe fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v4f32")]
-        fn vminnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
-    }
-vminnmq_f32_(a, b)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    vmull_u16(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Signed saturating doubling multiply long
+/// Vector long multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
-pub unsafe fn vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v4i32")]
-        fn vqdmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t;
-    }
-vqdmull_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t {
+    static_assert_imm3!(LANE);
+    vmull_u16(a, simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Signed saturating doubling multiply long
+/// Vector long multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
-pub unsafe fn vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v2i64")]
-        fn vqdmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t;
-    }
-vqdmull_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
+    static_assert_imm1!(LANE);
+    vmull_u32(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
 }

-/// Vector saturating doubling long multiply with scalar
+/// Vector long multiply by scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
-pub unsafe fn vqdmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t {
-    vqdmull_s16(a, vdup_n_s16(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t {
+    static_assert_imm2!(LANE);
+    vmull_u32(a, simd_shuffle2!(b, b, [LANE as u32, LANE as u32]))
 }

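`vmull_lane*` and `vmull_laneq*` combine the two mechanisms above: broadcast one lane of `b`, then widen-multiply, the usual building block for lane-indexed multiply-accumulate kernels. A hedged sketch with assumed inputs:

    unsafe {
        let a = vdup_n_u16(50_000);        // near the top of u16's range
        let b = vld1_u16([1u16, 2, 3, 4].as_ptr());
        let r = vmull_lane_u16::<3>(a, b); // uint32x4_t, every lane 200_000
    }
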
target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] +pub unsafe fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f32")] + fn vfma_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t; + } +vfma_f32_(b, c, a) } -/// Vector saturating doubling long multiply by scalar +/// Floating-point fused Multiply-Add to accumulator(vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqdmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { - static_assert_imm2!(N); - let b: int16x4_t = simd_shuffle4!(b, b, [N as u32, N as u32, N as u32, N as u32]); - vqdmull_s16(a, b) +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] +pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v4f32")] + fn vfmaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t; + } +vfmaq_f32_(b, c, a) } -/// Vector saturating doubling long multiply by scalar +/// Floating-point fused Multiply-Add to accumulator(vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqdmull_lane_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { - static_assert_imm1!(N); - let b: int32x2_t = simd_shuffle2!(b, b, [N as u32, N as u32]); - vqdmull_s32(a, b) +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] +pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t { + vfma_f32(a, b, vdup_n_f32(c)) } -/// Signed saturating doubling multiply-add long +/// Floating-point fused Multiply-Add to accumulator(vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))] -pub unsafe fn vqdmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { - vqaddq_s32(a, vqdmull_s16(b, c)) +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] +pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t { + vfmaq_f32(a, b, vdupq_n_f32(c)) } -/// Signed saturating doubling multiply-add long +/// Floating-point fused multiply-subtract from accumulator 
-/// Signed saturating doubling multiply-add long
+/// Floating-point fused multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
-pub unsafe fn vqdmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    vqaddq_s64(a, vqdmull_s32(b, c))
-}
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+pub unsafe fn vfms_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+    let b: float32x2_t = simd_neg(b);
+    vfma_f32(a, b, c)
+}

-/// Vector widening saturating doubling multiply accumulate with scalar
+/// Floating-point fused multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
-pub unsafe fn vqdmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
-    vqaddq_s32(a, vqdmull_n_s16(b, c))
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+pub unsafe fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+    let b: float32x4_t = simd_neg(b);
+    vfmaq_f32(a, b, c)
 }

-/// Vector widening saturating doubling multiply accumulate with scalar
+/// Floating-point fused Multiply-subtract to accumulator (vector)
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
-pub unsafe fn vqdmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
-    vqaddq_s64(a, vqdmull_n_s32(b, c))
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+pub unsafe fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+    vfms_f32(a, b, vdup_n_f32(c))
 }

-/// Vector widening saturating doubling multiply accumulate with scalar
+/// Floating-point fused Multiply-subtract to accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+pub unsafe fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+    vfmsq_f32(a, b, vdupq_n_f32(c))
+}
+
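`vfms_f32(a, b, c)` computes `a - b * c` by negating `b` and reusing the fused multiply-add, so the subtraction still rounds only once; this negate-plus-fma pattern is what the backends fold into the single `vfms`/`fmls` instruction that the `assert_instr` attributes above expect. Sketch under the same assumptions as the previous example:

    unsafe {
        let a = vdup_n_f32(10.0);
        let b = vdup_n_f32(2.0);
        let c = vdup_n_f32(3.0);
        let r = vfms_f32(a, b, c); // every lane: 10.0 - 2.0 * 3.0 = 4.0
    }
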
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 2))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn vqdmlal_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    static_assert_imm2!(N);
-    vqaddq_s32(a, vqdmull_lane_s16::<N>(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply accumulate with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 1))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn vqdmlal_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    static_assert_imm1!(N);
-    vqaddq_s64(a, vqdmull_lane_s32::<N>(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply-subtract long
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
-pub unsafe fn vqdmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    vqsubq_s32(a, vqdmull_s16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply-subtract long
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
-pub unsafe fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    vqsubq_s64(a, vqdmull_s32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply subtract with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
-pub unsafe fn vqdmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
-    vqsubq_s32(a, vqdmull_n_s16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply subtract with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
-pub unsafe fn vqdmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
-    vqsubq_s64(a, vqdmull_n_s32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply subtract with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 2))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn vqdmlsl_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    static_assert_imm2!(N);
-    vqsubq_s32(a, vqdmull_lane_s16::<N>(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply subtract with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 1))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn vqdmlsl_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    static_assert_imm1!(N);
-    vqsubq_s64(a, vqdmull_lane_s32::<N>(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply returning high half
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i16")]
-        fn vqdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vqdmulh_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply returning high half
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v8i16")]
-        fn vqdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vqdmulhq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply returning high half
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch 
= "arm"), assert_instr(vqdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] -pub unsafe fn vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v2i32")] - fn vqdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; - } -vqdmulh_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] +pub unsafe fn vsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_sub(a, b) } -/// Signed saturating doubling multiply returning high half +/// Subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] -pub unsafe fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i32")] - fn vqdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; - } -vqdmulhq_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] +pub unsafe fn vsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_sub(a, b) } -/// Vector saturating doubling multiply high with scalar +/// Subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] -pub unsafe fn vqdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t { - let b: int16x4_t = vdup_n_s16(b); - vqdmulh_s16(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] +pub unsafe fn vsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + simd_sub(a, b) } -/// Vector saturating doubling multiply high with scalar +/// Subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] -pub unsafe fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t { - let b: int32x2_t = vdup_n_s32(b); - vqdmulh_s32(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] +pub unsafe fn vsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_sub(a, b) } -/// Vector saturating doubling multiply high with scalar +/// Subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] -pub unsafe fn vqdmulhq_nq_s16(a: int16x8_t, b: i16) -> int16x8_t { - let b: int16x8_t = vdupq_n_s16(b); - vqdmulhq_s16(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] 
+pub unsafe fn vsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + simd_sub(a, b) } -/// Vector saturating doubling multiply high with scalar +/// Subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] -pub unsafe fn vqdmulhq_nq_s32(a: int32x4_t, b: i32) -> int32x4_t { - let b: int32x4_t = vdupq_n_s32(b); - vqdmulhq_s32(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] +pub unsafe fn vsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_sub(a, b) } -/// Signed saturating extract narrow +/// Subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))] -pub unsafe fn vqmovn_s16(a: int16x8_t) -> int8x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtn.v8i8")] - fn vqmovn_s16_(a: int16x8_t) -> int8x8_t; - } -vqmovn_s16_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))] +pub unsafe fn vsub_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + simd_sub(a, b) } -/// Signed saturating extract narrow +/// Subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))] -pub unsafe fn vqmovn_s32(a: int32x4_t) -> int16x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtn.v4i16")] - fn vqmovn_s32_(a: int32x4_t) -> int16x4_t; - } -vqmovn_s32_(a) -} - -/// Signed saturating extract narrow -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))] -pub unsafe fn vqmovn_s64(a: int64x2_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtn.v2i32")] - fn vqmovn_s64_(a: int64x2_t) -> int32x2_t; - } -vqmovn_s64_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))] +pub unsafe fn vsubq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + simd_sub(a, b) } -/// Unsigned saturating extract narrow +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))] -pub unsafe fn vqmovn_u16(a: uint16x8_t) -> uint8x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v8i8")] - fn vqmovn_u16_(a: uint16x8_t) -> uint8x8_t; - } -vqmovn_u16_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))] +pub unsafe fn vsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + let c: i16x8 = i16x8::new(8, 8, 8, 8, 8, 8, 8, 8); + simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } -/// Unsigned saturating extract narrow +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))] -pub unsafe fn vqmovn_u32(a: uint32x4_t) -> uint16x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v4i16")] - fn vqmovn_u32_(a: uint32x4_t) -> uint16x4_t; - } -vqmovn_u32_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))] +pub unsafe fn vsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + let c: i32x4 = i32x4::new(16, 16, 16, 16); + simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } -/// Unsigned saturating extract narrow +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))] -pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v2i32")] - fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; - } -vqmovn_u64_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))] +pub unsafe fn vsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + let c: i64x2 = i64x2::new(32, 32); + simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } -/// Signed saturating extract unsigned narrow +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))] -pub unsafe fn vqmovun_s16(a: int16x8_t) -> uint8x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v8i8")] - fn vqmovun_s16_(a: int16x8_t) -> uint8x8_t; - } -vqmovun_s16_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))] +pub unsafe fn vsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + let c: u16x8 = u16x8::new(8, 8, 8, 8, 8, 8, 8, 8); + simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } -/// Signed saturating extract unsigned narrow +/// Subtract returning high narrow #[inline] 
#[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))] -pub unsafe fn vqmovun_s32(a: int32x4_t) -> uint16x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v4i16")] - fn vqmovun_s32_(a: int32x4_t) -> uint16x4_t; - } -vqmovun_s32_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))] +pub unsafe fn vsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + let c: u32x4 = u32x4::new(16, 16, 16, 16); + simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } -/// Signed saturating extract unsigned narrow +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))] -pub unsafe fn vqmovun_s64(a: int64x2_t) -> uint32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v2i32")] - fn vqmovun_s64_(a: int64x2_t) -> uint32x2_t; - } -vqmovun_s64_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))] +pub unsafe fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + let c: u64x2 = u64x2::new(32, 32); + simd_cast(simd_shr(simd_sub(a, b), transmute(c))) } -/// Signed saturating rounding doubling multiply returning high half +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v4i16")] - fn vqrdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; - } -vqrdmulh_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))] +pub unsafe fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { + let d: int8x8_t = vsubhn_s16(b, c); + simd_shuffle16!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } -/// Signed saturating rounding doubling multiply returning high half +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = 
"llvm.aarch64.neon.sqrdmulh.v8i16")] - fn vqrdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; - } -vqrdmulhq_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))] +pub unsafe fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { + let d: int16x4_t = vsubhn_s32(b, c); + simd_shuffle8!(a, d, [0, 1, 2, 3, 4, 5, 6, 7]) } -/// Signed saturating rounding doubling multiply returning high half +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v2i32")] - fn vqrdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; - } -vqrdmulh_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))] +pub unsafe fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { + let d: int32x2_t = vsubhn_s64(b, c); + simd_shuffle4!(a, d, [0, 1, 2, 3]) } -/// Signed saturating rounding doubling multiply returning high half +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v4i32")] - fn vqrdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; - } -vqrdmulhq_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))] +pub unsafe fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { + let d: uint8x8_t = vsubhn_u16(b, c); + simd_shuffle16!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } -/// Vector saturating rounding doubling multiply high with scalar +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t { - vqrdmulh_s16(a, vdup_n_s16(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))] +pub unsafe fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { + let d: uint16x4_t = vsubhn_u32(b, c); + simd_shuffle8!(a, d, [0, 1, 2, 3, 4, 5, 6, 7]) } -/// Vector saturating rounding doubling multiply high with scalar +/// Subtract returning high narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t { - vqrdmulhq_s16(a, vdupq_n_s16(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))] +pub unsafe fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { + let d: uint32x2_t = vsubhn_u64(b, c); + simd_shuffle4!(a, d, [0, 1, 2, 3]) } -/// Vector saturating rounding doubling multiply high with scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t { - vqrdmulh_s32(a, vdup_n_s32(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i8")] + fn vhsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } +vhsub_u8_(a, b) } -/// Vector saturating rounding doubling multiply high with scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t { - vqrdmulhq_s32(a, vdupq_n_s32(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v16i8")] + fn vhsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } +vhsubq_u8_(a, b) } -/// Vector rounding saturating doubling multiply high by scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqrdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - static_assert_imm2!(LANE); - let b: int16x4_t = simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulh_s16(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i16")] + fn vhsub_u16_(a: uint16x4_t, b: uint16x4_t) 
-> uint16x4_t; + } +vhsub_u16_(a, b) } -/// Vector rounding saturating doubling multiply high by scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqrdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { - static_assert_imm3!(LANE); - let b: int16x4_t = simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulh_s16(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i16")] + fn vhsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } +vhsubq_u16_(a, b) } -/// Vector rounding saturating doubling multiply high by scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqrdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { - static_assert_imm2!(LANE); - let b: int16x8_t = simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulhq_s16(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v2i32")] + fn vhsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } +vhsub_u32_(a, b) } -/// Vector rounding saturating doubling multiply high by scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqrdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - static_assert_imm3!(LANE); - let b: int16x8_t = simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulhq_s16(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i32")] + fn vhsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } +vhsubq_u32_(a, b) 
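For reference, a hedged sketch of the halving-subtract semantics (assuming an AArch64 target; hsub_demo is a hypothetical name): each lane is (a - b) >> 1, computed in a wider type so the intermediate difference cannot wrap.

use core::arch::aarch64::*;

// (200 - 50) / 2 = 75 in every lane.
unsafe fn hsub_demo() {
    let a = vdup_n_u8(200);
    let b = vdup_n_u8(50);
    assert_eq!(vget_lane_u8::<0>(vhsub_u8(a, b)), 75);
}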
} -/// Vector rounding saturating doubling multiply high by scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqrdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - static_assert_imm1!(LANE); - let b: int32x2_t = simd_shuffle2!(b, b, [LANE as u32, LANE as u32]); - vqrdmulh_s32(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i8")] + fn vhsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } +vhsub_s8_(a, b) } -/// Vector rounding saturating doubling multiply high by scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqrdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { - static_assert_imm2!(LANE); - let b: int32x2_t = simd_shuffle2!(b, b, [LANE as u32, LANE as u32]); - vqrdmulh_s32(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v16i8")] + fn vhsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } +vhsubq_s8_(a, b) } -/// Vector rounding saturating doubling multiply high by scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqrdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { - static_assert_imm1!(LANE); - let b: int32x4_t = simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulhq_s32(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i16")] + fn vhsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } +vhsub_s16_(a, b) } -/// Vector rounding saturating doubling multiply high by scalar +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vqrdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - static_assert_imm2!(LANE); - let b: int32x4_t = simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulhq_s32(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i16")] + fn vhsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } +vhsubq_s16_(a, b) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { - vqadd_s16(a, vqrdmulh_s16(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v2i32")] + fn vhsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } +vhsub_s32_(a, b) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - vqaddq_s16(a, vqrdmulhq_s16(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i32")] + fn vhsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } +vhsubq_s32_(a, b) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Signed Subtract Wide #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { - vqadd_s32(a, vqrdmulh_s32(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))] +pub unsafe fn vsubw_s8(a: int16x8_t, b: int8x8_t) -> 
int16x8_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Signed Subtract Wide #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - vqaddq_s32(a, vqrdmulhq_s32(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))] +pub unsafe fn vsubw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Signed Subtract Wide #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlah_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { - static_assert_imm2!(LANE); - vqadd_s16(a, vqrdmulh_lane_s16::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))] +pub unsafe fn vsubw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Unsigned Subtract Wide #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlah_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { - static_assert_imm3!(LANE); - vqadd_s16(a, vqrdmulh_laneq_s16::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))] +pub unsafe fn vsubw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Unsigned Subtract Wide #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlahq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { - static_assert_imm2!(LANE); - vqaddq_s16(a, vqrdmulhq_lane_s16::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))] +pub unsafe fn vsubw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Unsigned Subtract Wide #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), 
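A minimal sketch of the widening subtract (assuming an AArch64 target; subw_demo is a hypothetical name): the narrow operand is sign- or zero-extended before the subtraction, so it cannot overflow the wide result lanes.

use core::arch::aarch64::*;

// `b` is sign-extended to i16 first, so 10 - (-128) = 138 is exact.
unsafe fn subw_demo() {
    let a = vdupq_n_s16(10);
    let b = vdup_n_s8(-128);
    assert_eq!(vgetq_lane_s16::<0>(vsubw_s8(a, b)), 138);
}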
assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlahq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - static_assert_imm3!(LANE); - vqaddq_s16(a, vqrdmulhq_laneq_s16::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))] +pub unsafe fn vsubw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Signed Subtract Long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlah_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { - static_assert_imm1!(LANE); - vqadd_s32(a, vqrdmulh_lane_s32::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] +pub unsafe fn vsubl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + let c: int16x8_t = simd_cast(a); + let d: int16x8_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Signed Subtract Long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlah_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { - static_assert_imm2!(LANE); - vqadd_s32(a, vqrdmulh_laneq_s32::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] +pub unsafe fn vsubl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + let c: int32x4_t = simd_cast(a); + let d: int32x4_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Signed Subtract Long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlahq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { - static_assert_imm1!(LANE); - vqaddq_s32(a, vqrdmulhq_lane_s32::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] +pub unsafe fn vsubl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + let c: int64x2_t = simd_cast(a); + let d: int64x2_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounding doubling multiply accumulate returning high half +/// Unsigned Subtract Long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlahq_laneq_s32(a: int32x4_t, b: int32x4_t, 
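The long variants widen both operands instead of only one. A hedged sketch (same assumptions; subl_demo is a hypothetical name):

use core::arch::aarch64::*;

// Both inputs widen to i16, so even i8::MIN - i8::MAX is exact.
unsafe fn subl_demo() {
    let a = vdup_n_s8(i8::MIN);
    let b = vdup_n_s8(i8::MAX);
    assert_eq!(vgetq_lane_s16::<0>(vsubl_s8(a, b)), -255);
}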
c: int32x4_t) -> int32x4_t { - static_assert_imm2!(LANE); - vqaddq_s32(a, vqrdmulhq_laneq_s32::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] +pub unsafe fn vsubl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + let c: uint16x8_t = simd_cast(a); + let d: uint16x8_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Unsigned Subtract Long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlsh_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { - vqsub_s16(a, vqrdmulh_s16(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] +pub unsafe fn vsubl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + let c: uint32x4_t = simd_cast(a); + let d: uint32x4_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Unsigned Subtract Long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlshq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - vqsubq_s16(a, vqrdmulhq_s16(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] +pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + let c: uint64x2_t = simd_cast(a); + let d: uint64x2_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlsh_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { - vqsub_s32(a, vqrdmulh_s32(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i8")] + fn vmax_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } +vmax_s8_(a, b) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlshq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - vqsubq_s32(a, vqrdmulhq_s32(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmaxq_s8(a: int8x16_t, 
b: int8x16_t) -> int8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v16i8")] + fn vmaxq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } +vmaxq_s8_(a, b) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlsh_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { - static_assert_imm2!(LANE); - vqsub_s16(a, vqrdmulh_lane_s16::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i16")] + fn vmax_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } +vmax_s16_(a, b) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlsh_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { - static_assert_imm3!(LANE); - vqsub_s16(a, vqrdmulh_laneq_s16::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i16")] + fn vmaxq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } +vmaxq_s16_(a, b) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlshq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { - static_assert_imm2!(LANE); - vqsubq_s16(a, vqrdmulhq_lane_s16::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v2i32")] + fn vmax_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } +vmax_s32_(a, b) } -/// Signed saturating rounding doubling 
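For completeness, a trivial sketch of the lane-wise integer maximum (assuming an AArch64 target; max_demo is a hypothetical name):

use core::arch::aarch64::*;

// Each lane takes the larger of the two signed inputs.
unsafe fn max_demo() {
    let a = vdup_n_s16(-3);
    let b = vdup_n_s16(7);
    assert_eq!(vget_lane_s16::<0>(vmax_s16(a, b)), 7);
}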
multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlshq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - static_assert_imm3!(LANE); - vqsubq_s16(a, vqrdmulhq_laneq_s16::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i32")] + fn vmaxq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } +vmaxq_s32_(a, b) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlsh_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { - static_assert_imm1!(LANE); - vqsub_s32(a, vqrdmulh_lane_s32::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i8")] + fn vmax_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } +vmax_u8_(a, b) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlsh_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { - static_assert_imm2!(LANE); - vqsub_s32(a, vqrdmulh_laneq_s32::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v16i8")] + fn vmaxq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } +vmaxq_u8_(a, b) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] 
-#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlshq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { - static_assert_imm1!(LANE); - vqsubq_s32(a, vqrdmulhq_lane_s32::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i16")] + fn vmax_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } +vmax_u16_(a, b) } -/// Signed saturating rounding doubling multiply subtract returning high half +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlshq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - static_assert_imm2!(LANE); - vqsubq_s32(a, vqrdmulhq_laneq_s32::(b, c)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i16")] + fn vmaxq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } +vmaxq_u16_(a, b) } -/// Signed saturating rounding shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - #[allow(improper_ctypes)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i8")] - fn vqrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v2i32")] + fn vmax_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; } -vqrshl_s8_(a, b) +vmax_u32_(a, b) } -/// Signed saturating rounding shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v16i8")] - fn vqrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i32")] + fn vmaxq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; } -vqrshlq_s8_(a, b) +vmaxq_u32_(a, b) } -/// Signed saturating rounding shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))] +pub unsafe fn vmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i16")] - fn vqrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v2f32")] + fn vmax_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t; } -vqrshl_s16_(a, b) +vmax_f32_(a, b) } -/// Signed saturating rounding shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))] +pub unsafe fn vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i16")] - fn vqrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v4f32")] + fn vmaxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; } -vqrshlq_s16_(a, b) +vmaxq_f32_(a, b) } -/// Signed saturating rounding shift left +/// Floating-point Maximun Number (vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))] +pub unsafe fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i32")] - 
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i32")] - fn vqrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v2f32")] + fn vmaxnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t; } -vqrshl_s32_(a, b) +vmaxnm_f32_(a, b) } -/// Signed saturating rounding shift left +/// Floating-point Maximun Number (vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))] +pub unsafe fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i32")] - fn vqrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v4f32")] + fn vmaxnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; } -vqrshlq_s32_(a, b) +vmaxnmq_f32_(a, b) } -/// Signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v1i64")] - fn vqrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i8")] + fn vmin_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; } -vqrshl_s64_(a, b) +vmin_s8_(a, b) } -/// Signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i64")] - fn 
vqrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v16i8")] + fn vminq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } -vqrshlq_s64_(a, b) +vminq_s8_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i8")] - fn vqrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i16")] + fn vmin_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } -vqrshl_u8_(a, b) +vmin_s16_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v16i8")] - fn vqrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i16")] + fn vminq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } -vqrshlq_u8_(a, b) +vminq_s16_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i16")] - fn vqrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = 
"llvm.aarch64.neon.smin.v2i32")] + fn vmin_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } -vqrshl_u16_(a, b) +vmin_s32_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i16")] - fn vqrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i32")] + fn vminq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; } -vqrshlq_u16_(a, b) +vminq_s32_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i32")] - fn vqrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i8")] + fn vmin_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; } -vqrshl_u32_(a, b) +vmin_u8_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i32")] - fn vqrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v16i8")] + fn vminq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; } -vqrshlq_u32_(a, b) +vminq_u8_(a, b) } -/// Unsigned signed saturating rounding 
shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v1i64")] - fn vqrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i16")] + fn vmin_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; } -vqrshl_u64_(a, b) +vmin_u16_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i64")] - fn vqrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i16")] + fn vminq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; } -vqrshlq_u64_(a, b) +vminq_u16_(a, b) } -/// Signed saturating rounded shift right narrow +/// Minimum (vector) #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v8i8")] - fn vqrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v2i32")] + fn vmin_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; } -vqrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) +vmin_u32_(a, b) } -/// Signed saturating rounded shift right narrow +/// Minimum (vector) #[inline] -#[cfg(target_arch = 
"aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v8i8")] - fn vqrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i32")] + fn vminq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; } -vqrshrn_n_s16_(a, N) +vminq_u32_(a, b) } -/// Signed saturating rounded shift right narrow +/// Minimum (vector) #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))] +pub unsafe fn vmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v4i16")] - fn vqrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v2f32")] + fn vmin_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t; } -vqrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) +vmin_f32_(a, b) } -/// Signed saturating rounded shift right narrow +/// Minimum (vector) #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))] +pub unsafe fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v4i16")] - fn vqrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v4f32")] + fn vminq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; } -vqrshrn_n_s32_(a, N) +vminq_f32_(a, b) } -/// Signed saturating rounded shift right narrow +/// Floating-point Minimun Number (vector) #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] 
-/// Signed saturating rounded shift right narrow
+/// Floating-point Minimum Number (vector)
#[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
- static_assert!(N : i32 where N >= 1 && N <= 32);
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))]
+pub unsafe fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 #[allow(improper_ctypes)]
 extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v2i32")]
- fn vqrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v2f32")]
+ fn vminnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
 }
-vqrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+vminnm_f32_(a, b)
}
-/// Signed saturating rounded shift right narrow
+/// Floating-point Minimum Number (vector)
#[inline]
-#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
- static_assert!(N : i32 where N >= 1 && N <= 32);
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))]
+pub unsafe fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 #[allow(improper_ctypes)]
 extern "unadjusted" {
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v2i32")]
- fn vqrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v4f32")]
+ fn vminnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
 }
-vqrshrn_n_s64_(a, N)
+vminnmq_f32_(a, b)
}
-/// Unsigned signed saturating rounded shift right narrow
+/// Signed saturating doubling multiply long
#[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqrshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
- static_assert!(N : i32 where N >= 1 && N <= 8);
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
+pub unsafe fn vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
 #[allow(improper_ctypes)]
 extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v8i8")]
- fn vqrshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v4i32")]
+ fn vqdmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t;
 }
-vqrshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16))
+vqdmull_s16_(a, b)
}
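// Editorial aside -- illustrative only, not part of the generated diff.
// vqdmull_s16 widens each lane to i32 and computes sat(2 * a[i] * b[i]);
// the only input pair that can actually saturate is i16::MIN * i16::MIN,
// since 2 * (-32768)^2 = 2^31 exceeds i32::MAX. Sketch in the crate's test
// style (the function name is an assumption):
#[simd_test(enable = "neon")]
unsafe fn example_vqdmull_s16() {
    let a: int16x4_t = vld1_s16([1i16, 2, 3, i16::MIN].as_ptr());
    let b: int16x4_t = vld1_s16([10i16, 10, 10, i16::MIN].as_ptr());
    let mut r: [i32; 4] = [0; 4];
    vst1q_s32(r.as_mut_ptr(), vqdmull_s16(a, b));
    assert_eq!(r, [20, 40, 60, i32::MAX]); // last lane clamps instead of wrapping
}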
"neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v8i8")] - fn vqrshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v2i64")] + fn vqdmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t; } -vqrshrn_n_u16_(a, N) +vqdmull_s32_(a, b) } -/// Unsigned signed saturating rounded shift right narrow +/// Vector saturating doubling long multiply with scalar #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v4i16")] - fn vqrshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t; - } -vqrshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t { + vqdmull_s16(a, vdup_n_s16(b)) } -/// Unsigned signed saturating rounded shift right narrow +/// Vector saturating doubling long multiply with scalar #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v4i16")] - fn vqrshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t; - } -vqrshrn_n_u32_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t { + vqdmull_s32(a, vdup_n_s32(b)) } -/// Unsigned signed saturating rounded shift right narrow +/// Vector saturating doubling long multiply by scalar #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v2i32")] - fn vqrshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> 
- }
-vqrshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ let b: int16x4_t = simd_shuffle4!(b, b, [N as u32, N as u32, N as u32, N as u32]);
+ vqdmull_s16(a, b)
}
-/// Unsigned signed saturating rounded shift right narrow
+/// Vector saturating doubling long multiply by scalar
#[inline]
-#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
- static_assert!(N : i32 where N >= 1 && N <= 32);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v2i32")]
- fn vqrshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t;
- }
-vqrshrn_n_u64_(a, N)
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_lane_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ let b: int32x2_t = simd_shuffle2!(b, b, [N as u32, N as u32]);
+ vqdmull_s32(a, b)
}
-/// Signed saturating rounded shift right unsigned narrow
+/// Signed saturating doubling multiply-add long
#[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqrshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
- static_assert!(N : i32 where N >= 1 && N <= 8);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v8i8")]
- fn vqrshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t;
- }
-vqrshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+pub unsafe fn vqdmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ vqaddq_s32(a, vqdmull_s16(b, c))
}
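// Editorial aside -- illustrative only, not part of the generated diff. The
// lane index of the _lane_ variants just defined is a const generic checked
// by static_assert_imm2!, so an out-of-range lane fails at compile time
// rather than at run time. Hypothetical usage (names are assumptions):
#[simd_test(enable = "neon")]
unsafe fn example_vqdmull_lane_s16() {
    let a: int16x4_t = vdup_n_s16(3);
    let v: int16x4_t = vld1_s16([0i16, 2, 4, 8].as_ptr());
    let mut r: [i32; 4] = [0; 4];
    vst1q_s32(r.as_mut_ptr(), vqdmull_lane_s16::<2>(a, v)); // lane 2 holds 4
    assert_eq!(r, [24, 24, 24, 24]); // every lane: 2 * 3 * 4
}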
-/// Signed saturating rounded shift right unsigned narrow
+/// Signed saturating doubling multiply-add long
#[inline]
-#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+pub unsafe fn vqdmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ vqaddq_s64(a, vqdmull_s32(b, c))
}
-/// Signed saturating rounded shift right unsigned narrow
+/// Vector widening saturating doubling multiply accumulate with scalar
#[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqrshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
- static_assert!(N : i32 where N >= 1 && N <= 16);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v4i16")]
- fn vqrshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t;
- }
-vqrshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+pub unsafe fn vqdmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+ vqaddq_s32(a, vqdmull_n_s16(b, c))
}
-/// Signed saturating rounded shift right unsigned narrow
+/// Vector widening saturating doubling multiply accumulate with scalar
#[inline]
-#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqrshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
- static_assert!(N : i32 where N >= 1 && N <= 16);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v4i16")]
- fn vqrshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t;
- }
-vqrshrun_n_s32_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+pub unsafe fn vqdmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+ vqaddq_s64(a, vqdmull_n_s32(b, c))
}
-/// Signed saturating rounded shift right unsigned narrow
+/// Vector widening saturating doubling multiply accumulate with scalar
#[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqrshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
- static_assert!(N : i32 where N >= 1 && N <= 32);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v2i32")]
- fn vqrshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t;
- }
-vqrshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ vqaddq_s32(a, vqdmull_lane_s16::<N>(b, c))
}
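// Editorial aside -- illustrative only, not part of the generated diff.
// vqdmlal_lane_s16 folds the doubled lane product into a saturating
// accumulate: acc[i] + sat(2 * b[i] * c[LANE]). Hypothetical usage:
#[simd_test(enable = "neon")]
unsafe fn example_vqdmlal_lane_s16() {
    let acc: int32x4_t = vdupq_n_s32(1);
    let b: int16x4_t = vdup_n_s16(3);
    let c: int16x4_t = vld1_s16([0i16, 2, 4, 8].as_ptr());
    let mut r: [i32; 4] = [0; 4];
    vst1q_s32(r.as_mut_ptr(), vqdmlal_lane_s16::<3>(acc, b, c)); // lane 3 holds 8
    assert_eq!(r, [49, 49, 49, 49]); // 1 + 2 * 3 * 8
}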
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v2i32")] - fn vqrshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t; - } -vqrshrun_n_s64_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqdmlal_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + static_assert_imm1!(N); + vqaddq_s64(a, vqdmull_lane_s32::(b, c)) } -/// Signed saturating shift left +/// Signed saturating doubling multiply-subtract long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i8")] - fn vqshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; - } -vqshl_s8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))] +pub unsafe fn vqdmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + vqsubq_s32(a, vqdmull_s16(b, c)) } -/// Signed saturating shift left +/// Signed saturating doubling multiply-subtract long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v16i8")] - fn vqshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; - } -vqshlq_s8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))] +pub unsafe fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + vqsubq_s64(a, vqdmull_s32(b, c)) } -/// Signed saturating shift left +/// Vector widening saturating doubling multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i16")] - fn vqshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; - } -vqshl_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))] 
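// Editorial aside -- illustrative only, not part of the generated diff.
// vqdmlsl is the subtracting twin of vqdmlal: acc[i] - sat(2 * b[i] * c[i]),
// with the subtraction itself saturating. Hypothetical usage:
#[simd_test(enable = "neon")]
unsafe fn example_vqdmlsl_s32() {
    let acc: int64x2_t = vdupq_n_s64(100);
    let b: int32x2_t = vdup_n_s32(3);
    let c: int32x2_t = vdup_n_s32(4);
    let mut r: [i64; 2] = [0; 2];
    vst1q_s64(r.as_mut_ptr(), vqdmlsl_s32(acc, b, c));
    assert_eq!(r, [76, 76]); // 100 - 2 * 3 * 4
}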
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))] +pub unsafe fn vqdmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t { + vqsubq_s32(a, vqdmull_n_s16(b, c)) } -/// Signed saturating shift left +/// Vector widening saturating doubling multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i16")] - fn vqshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; - } -vqshlq_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))] +pub unsafe fn vqdmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t { + vqsubq_s64(a, vqdmull_n_s32(b, c)) } -/// Signed saturating shift left +/// Vector widening saturating doubling multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i32")] - fn vqshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; - } -vqshl_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 2))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqdmlsl_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + static_assert_imm2!(N); + vqsubq_s32(a, vqdmull_lane_s16::(b, c)) } -/// Signed saturating shift left +/// Vector widening saturating doubling multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqdmlsl_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + static_assert_imm1!(N); + vqsubq_s64(a, vqdmull_lane_s32::(b, c)) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name 
= "llvm.aarch64.neon.sqshl.v4i32")] - fn vqshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i16")] + fn vqdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } -vqshlq_s32_(a, b) +vqdmulh_s16_(a, b) } -/// Signed saturating shift left +/// Signed saturating doubling multiply returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v1i64")] - fn vqshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v8i16")] + fn vqdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } -vqshl_s64_(a, b) +vqdmulhq_s16_(a, b) } -/// Signed saturating shift left +/// Signed saturating doubling multiply returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i64")] - fn vqshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v2i32")] + fn vqdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } -vqshlq_s64_(a, b) +vqdmulh_s32_(a, b) } -/// Unsigned saturating shift left +/// Signed saturating doubling multiply returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i8")] - fn vqshl_u8_(a: uint8x8_t, b: int8x8_t) -> 
uint8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i32")] + fn vqdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; } -vqshl_u8_(a, b) +vqdmulhq_s32_(a, b) } -/// Unsigned saturating shift left +/// Vector saturating doubling multiply high with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v16i8")] - fn vqshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; - } -vqshlq_u8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t { + let b: int16x4_t = vdup_n_s16(b); + vqdmulh_s16(a, b) } -/// Unsigned saturating shift left +/// Vector saturating doubling multiply high with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t { + let b: int32x2_t = vdup_n_s32(b); + vqdmulh_s32(a, b) +} + +/// Vector saturating doubling multiply high with scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulhq_nq_s16(a: int16x8_t, b: i16) -> int16x8_t { + let b: int16x8_t = vdupq_n_s16(b); + vqdmulhq_s16(a, b) +} + +/// Vector saturating doubling multiply high with scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulhq_nq_s32(a: int32x4_t, b: i32) -> int32x4_t { + let b: int32x4_t = vdupq_n_s32(b); + vqdmulhq_s32(a, b) +} + +/// Signed saturating extract narrow +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))] +pub unsafe fn vqmovn_s16(a: int16x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i16")] - fn vqshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = 
"llvm.aarch64.neon.sqxtn.v8i8")] + fn vqmovn_s16_(a: int16x8_t) -> int8x8_t; } -vqshl_u16_(a, b) +vqmovn_s16_(a) } -/// Unsigned saturating shift left +/// Signed saturating extract narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))] +pub unsafe fn vqmovn_s32(a: int32x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i16")] - fn vqshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtn.v4i16")] + fn vqmovn_s32_(a: int32x4_t) -> int16x4_t; } -vqshlq_u16_(a, b) +vqmovn_s32_(a) } -/// Unsigned saturating shift left +/// Signed saturating extract narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))] +pub unsafe fn vqmovn_s64(a: int64x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i32")] - fn vqshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtn.v2i32")] + fn vqmovn_s64_(a: int64x2_t) -> int32x2_t; } -vqshl_u32_(a, b) +vqmovn_s64_(a) } -/// Unsigned saturating shift left +/// Unsigned saturating extract narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))] +pub unsafe fn vqmovn_u16(a: uint16x8_t) -> uint8x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i32")] - fn vqshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v8i8")] + fn vqmovn_u16_(a: uint16x8_t) -> uint8x8_t; } -vqshlq_u32_(a, b) +vqmovn_u16_(a) } -/// Unsigned saturating shift left +/// Unsigned saturating extract narrow #[inline] #[target_feature(enable = "neon")] 
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))] +pub unsafe fn vqmovn_u32(a: uint32x4_t) -> uint16x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v1i64")] - fn vqshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v4i16")] + fn vqmovn_u32_(a: uint32x4_t) -> uint16x4_t; } -vqshl_u64_(a, b) +vqmovn_u32_(a) } -/// Unsigned saturating shift left +/// Unsigned saturating extract narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))] +pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i64")] - fn vqshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v2i32")] + fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; } -vqshlq_u64_(a, b) +vqmovn_u64_(a) } -/// Signed saturating shift left +/// Signed saturating extract unsigned narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_s8(a: int8x8_t) -> int8x8_t { - static_assert_imm3!(N); - vqshl_s8(a, vdup_n_s8(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))] +pub unsafe fn vqmovun_s16(a: int16x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v8i8")] + fn vqmovun_s16_(a: int16x8_t) -> uint8x8_t; + } +vqmovun_s16_(a) } -/// Signed saturating shift left +/// Signed saturating extract unsigned narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_s8(a: int8x16_t) -> int8x16_t { - static_assert_imm3!(N); - vqshlq_s8(a, 
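// Editorial aside -- illustrative only, not part of the generated diff. The
// vqmovn_* family narrows with saturation toward the destination type, and
// vqmovun_s16 additionally maps a signed source into unsigned bounds:
// negatives clamp to 0, anything above u8::MAX clamps to 255. Hypothetical
// usage (function name assumed):
#[simd_test(enable = "neon")]
unsafe fn example_vqmovun_s16() {
    let a: int16x8_t = vld1q_s16([-5i16, 0, 7, 127, 128, 255, 256, 300].as_ptr());
    let mut r: [u8; 8] = [0; 8];
    vst1_u8(r.as_mut_ptr(), vqmovun_s16(a));
    assert_eq!(r, [0, 0, 7, 127, 128, 255, 255, 255]);
}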
-/// Signed saturating shift left
+/// Signed saturating extract unsigned narrow
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshlq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
- static_assert_imm3!(N);
- vqshlq_s8(a, vdupq_n_s8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))]
+pub unsafe fn vqmovun_s32(a: int32x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v4i16")]
+ fn vqmovun_s32_(a: int32x4_t) -> uint16x4_t;
+ }
+vqmovun_s32_(a)
}
-/// Signed saturating shift left
+/// Signed saturating extract unsigned narrow
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshl_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
- static_assert_imm4!(N);
- vqshl_s16(a, vdup_n_s16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))]
+pub unsafe fn vqmovun_s64(a: int64x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v2i32")]
+ fn vqmovun_s64_(a: int64x2_t) -> uint32x2_t;
+ }
+vqmovun_s64_(a)
}
-/// Signed saturating shift left
+/// Signed saturating rounding doubling multiply returning high half
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshlq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
- static_assert_imm4!(N);
- vqshlq_s16(a, vdupq_n_s16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+pub unsafe fn vqrdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v4i16")]
+ fn vqrdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+vqrdmulh_s16_(a, b)
}
-/// Signed saturating shift left
+/// Signed saturating rounding doubling multiply returning high half
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshl_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
- static_assert_imm5!(N);
- vqshl_s32(a, vdup_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+pub unsafe fn vqrdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v8i16")]
+ fn vqrdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+vqrdmulhq_s16_(a, b)
}
-/// Signed saturating shift left
+/// Signed saturating rounding doubling multiply returning high half
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshlq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
- static_assert_imm5!(N);
- vqshlq_s32(a, vdupq_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+pub unsafe fn vqrdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v2i32")]
+ fn vqrdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+vqrdmulh_s32_(a, b)
}
-/// Signed saturating shift left
+/// Signed saturating rounding doubling multiply returning high half
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshl_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
- static_assert_imm6!(N);
- vqshl_s64(a, vdup_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+pub unsafe fn vqrdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v4i32")]
+ fn vqrdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+vqrdmulhq_s32_(a, b)
}
-/// Signed saturating shift left
+/// Vector saturating rounding doubling multiply high with scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshlq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
- static_assert_imm6!(N);
- vqshlq_s64(a, vdupq_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+pub unsafe fn vqrdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t {
+ vqrdmulh_s16(a, vdup_n_s16(b))
}
-/// Unsigned saturating shift left
+/// Vector saturating rounding doubling multiply high with scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
- static_assert_imm3!(N);
- vqshl_u8(a, vdup_n_s8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test,
target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t { + vqrdmulhq_s16(a, vdupq_n_s16(b)) } -/// Unsigned saturating shift left +/// Vector saturating rounding doubling multiply high with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_u8(a: uint8x16_t) -> uint8x16_t { - static_assert_imm3!(N); - vqshlq_u8(a, vdupq_n_s8(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t { + vqrdmulh_s32(a, vdup_n_s32(b)) } -/// Unsigned saturating shift left +/// Vector saturating rounding doubling multiply high with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_u16(a: uint16x4_t) -> uint16x4_t { - static_assert_imm4!(N); - vqshl_u16(a, vdup_n_s16(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t { + vqrdmulhq_s32(a, vdupq_n_s32(b)) } -/// Unsigned saturating shift left +/// Vector rounding saturating doubling multiply high by scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_u16(a: uint16x8_t) -> uint16x8_t { - static_assert_imm4!(N); - vqshlq_u16(a, vdupq_n_s16(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vqrdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_imm2!(LANE); + let b: int16x4_t = simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vqrdmulh_s16(a, b) } -/// Unsigned saturating shift left +/// Vector rounding saturating doubling multiply high by scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_u32(a: uint32x2_t) -> uint32x2_t { - static_assert_imm5!(N); - vqshl_u32(a, vdup_n_s32(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vqrdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { + static_assert_imm3!(LANE); + let b: int16x4_t = simd_shuffle4!(b, b, [LANE as u32, 
-/// Unsigned saturating shift left
+/// Vector rounding saturating doubling multiply high by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
- static_assert_imm5!(N);
- vqshl_u32(a, vdup_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqrdmulh_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(LANE);
+ let b: int16x4_t = simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulh_s16(a, b)
}
-/// Unsigned saturating shift left
+/// Vector rounding saturating doubling multiply high by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
- static_assert_imm5!(N);
- vqshlq_u32(a, vdupq_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqrdmulhq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(LANE);
+ let b: int16x8_t = simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulhq_s16(a, b)
}
-/// Unsigned saturating shift left
+/// Vector rounding saturating doubling multiply high by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshl_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
- static_assert_imm6!(N);
- vqshl_u64(a, vdup_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqrdmulhq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ let b: int16x8_t = simd_shuffle8!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulhq_s16(a, b)
}
-/// Unsigned saturating shift left
+/// Vector rounding saturating doubling multiply high by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshlq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
- static_assert_imm6!(N);
- vqshlq_u64(a, vdupq_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqrdmulh_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ let b: int32x2_t = simd_shuffle2!(b, b, [LANE as u32, LANE as u32]);
+ vqrdmulh_s32(a, b)
}
-/// Signed saturating shift right narrow
+/// Vector rounding saturating doubling multiply high by scalar
#[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
- static_assert!(N : i32 where N >= 1 && N <= 8);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v8i8")]
- fn vqshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
- }
-vqshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqrdmulh_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ let b: int32x2_t = simd_shuffle2!(b, b, [LANE as u32, LANE as u32]);
+ vqrdmulh_s32(a, b)
}
-/// Signed saturating shift right narrow
#[inline]
-#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
- static_assert!(N : i32 where N >= 1 && N <= 8);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v8i8")]
- fn vqshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
- }
-vqshrn_n_s16_(a, N)
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqrdmulhq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ let b: int32x4_t = simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulhq_s32(a, b)
}
-/// Signed saturating shift right narrow
+/// Vector rounding saturating doubling multiply high by scalar
#[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
- static_assert!(N : i32 where N >= 1 && N <= 16);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v4i16")]
- fn vqshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
- }
-vqshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqrdmulhq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ let b: int32x4_t = simd_shuffle4!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulhq_s32(a, b)
}
-/// Signed saturating shift right narrow
+/// Signed saturating rounding doubling multiply accumulate returning high half
#[inline]
-#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
- static_assert!(N : i32 where N >= 1 && N <= 16);
- #[allow(improper_ctypes)]
- extern "unadjusted" {
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v4i16")]
- fn
vqshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t; - } -vqshrn_n_s32_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + vqadd_s16(a, vqrdmulh_s16(b, c)) } -/// Signed saturating shift right narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_s64(a: int64x2_t) -> int32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v2i32")] - fn vqshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t; - } -vqshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + vqaddq_s16(a, vqrdmulhq_s16(b, c)) } -/// Signed saturating shift right narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_s64(a: int64x2_t) -> int32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v2i32")] - fn vqshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t; - } -vqshrn_n_s64_(a, N) -} +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + vqadd_s32(a, vqrdmulh_s32(b, c)) +} -/// Unsigned saturating shift right narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v8i8")] - fn vqshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t; - } -vqshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + vqaddq_s32(a, vqrdmulhq_s32(b, c)) } -/// Unsigned saturating shift right 
narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v8i8")] - fn vqshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t; - } -vqshrn_n_u16_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlah_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + static_assert_imm2!(LANE); + vqadd_s16(a, vqrdmulh_lane_s16::(b, c)) } -/// Unsigned saturating shift right narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u32(a: uint32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v4i16")] - fn vqshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t; - } -vqshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlah_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { + static_assert_imm3!(LANE); + vqadd_s16(a, vqrdmulh_laneq_s16::(b, c)) } -/// Unsigned saturating shift right narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u32(a: uint32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v4i16")] - fn vqshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t; - } -vqshrn_n_u32_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlahq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { + static_assert_imm2!(LANE); + vqaddq_s16(a, vqrdmulhq_lane_s16::(b, c)) } -/// Unsigned saturating shift right narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] 
-#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u64(a: uint64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v2i32")] - fn vqshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t; - } -vqshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlahq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + static_assert_imm3!(LANE); + vqaddq_s16(a, vqrdmulhq_laneq_s16::(b, c)) } -/// Unsigned saturating shift right narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u64(a: uint64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v2i32")] - fn vqshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t; - } -vqshrn_n_u64_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlah_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + static_assert_imm1!(LANE); + vqadd_s32(a, vqrdmulh_lane_s32::(b, c)) } -/// Signed saturating shift right unsigned narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrun_n_s16(a: int16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v8i8")] - fn vqshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t; - } -vqshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlah_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { + static_assert_imm2!(LANE); + vqadd_s32(a, vqrdmulh_laneq_s32::(b, c)) } -/// Signed saturating shift right unsigned narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrun_n_s16(a: int16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N 
>= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v8i8")] - fn vqshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t; - } -vqshrun_n_s16_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlahq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { + static_assert_imm1!(LANE); + vqaddq_s32(a, vqrdmulhq_lane_s32::(b, c)) } -/// Signed saturating shift right unsigned narrow +/// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrun_n_s32(a: int32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v4i16")] - fn vqshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t; - } -vqshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlahq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + static_assert_imm2!(LANE); + vqaddq_s32(a, vqrdmulhq_laneq_s32::(b, c)) } -/// Signed saturating shift right unsigned narrow +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrun_n_s32(a: int32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v4i16")] - fn vqshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t; - } -vqshrun_n_s32_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmlsh_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + vqsub_s16(a, vqrdmulh_s16(b, c)) } -/// Signed saturating shift right unsigned narrow +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrun_n_s64(a: int64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v2i32")] - fn vqshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t; - } -vqshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64)) +#[target_feature(enable = "neon")] 
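// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.] The
// vqshrun_n_* blocks shuffled above ("signed saturating shift right unsigned
// narrow") shift each signed lane right by the constant N and clamp the
// result into the unsigned narrow range, so negative lanes become 0.
// Per-lane model for the s16 -> u8 case, with 1 <= N <= 8 as enforced by the
// static_assert! (name hypothetical):
//
//     fn sqshrun_n_i16<const N: i32>(a: i16) -> u8 {
//         ((a as i32) >> N).clamp(0, u8::MAX as i32) as u8
//     }
// ---------------------------------------------------------------------------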
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmlshq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + vqsubq_s16(a, vqrdmulhq_s16(b, c)) } -/// Signed saturating shift right unsigned narrow +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrun_n_s64(a: int64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v2i32")] - fn vqshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t; - } -vqshrun_n_s64_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmlsh_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + vqsub_s32(a, vqrdmulh_s32(b, c)) } -/// Reciprocal square-root estimate. +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))] -pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")] - fn vrsqrte_f32_(a: float32x2_t) -> float32x2_t; - } -vrsqrte_f32_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] +pub unsafe fn vqrdmlshq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + vqsubq_s32(a, vqrdmulhq_s32(b, c)) } -/// Reciprocal square-root estimate. +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))] -pub unsafe fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v4f32")] - fn vrsqrteq_f32_(a: float32x4_t) -> float32x4_t; - } -vrsqrteq_f32_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlsh_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + static_assert_imm2!(LANE); + vqsub_s16(a, vqrdmulh_lane_s16::(b, c)) } -/// Reciprocal estimate. 
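// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.] The
// vqrdmlsh_* family in this hunk mirrors vqrdmlah_* with the accumulate step
// flipped: the rounded doubling high-half product is saturating-subtracted
// from `a` (vqsub_* instead of vqadd_*). Per-lane model (name hypothetical):
//
//     fn sqrdmlsh_i16(a: i16, b: i16, c: i16) -> i16 {
//         let hi = ((2 * b as i64 * c as i64 + (1 << 15)) >> 16)
//             .clamp(i16::MIN as i64, i16::MAX as i64);
//         (a as i64 - hi).clamp(i16::MIN as i64, i16::MAX as i64) as i16
//     }
// ---------------------------------------------------------------------------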
+/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))] -pub unsafe fn vrecpe_f32(a: float32x2_t) -> float32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v2f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v2f32")] - fn vrecpe_f32_(a: float32x2_t) -> float32x2_t; - } -vrecpe_f32_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlsh_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { + static_assert_imm3!(LANE); + vqsub_s16(a, vqrdmulh_laneq_s16::(b, c)) } -/// Reciprocal estimate. +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))] -pub unsafe fn vrecpeq_f32(a: float32x4_t) -> float32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v4f32")] - fn vrecpeq_f32_(a: float32x4_t) -> float32x4_t; - } -vrecpeq_f32_(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlshq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { + static_assert_imm2!(LANE); + vqsubq_s16(a, vqrdmulhq_lane_s16::(b, c)) } -/// Vector reinterpret cast operation +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_u8(a: uint8x8_t) -> int8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlshq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + static_assert_imm3!(LANE); + vqsubq_s16(a, vqrdmulhq_laneq_s16::(b, c)) } -/// Vector reinterpret cast operation +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_p8(a: poly8x8_t) -> int8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn 
vqrdmlsh_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + static_assert_imm1!(LANE); + vqsub_s32(a, vqrdmulh_lane_s32::(b, c)) } -/// Vector reinterpret cast operation +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_p16(a: poly16x4_t) -> int16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlsh_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { + static_assert_imm2!(LANE); + vqsub_s32(a, vqrdmulh_laneq_s32::(b, c)) } -/// Vector reinterpret cast operation +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_u16(a: uint16x4_t) -> int16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlshq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { + static_assert_imm1!(LANE); + vqsubq_s32(a, vqrdmulhq_lane_s32::(b, c)) } -/// Vector reinterpret cast operation +/// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_u32(a: uint32x2_t) -> int32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlshq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + static_assert_imm2!(LANE); + vqsubq_s32(a, vqrdmulhq_laneq_s32::(b, c)) } -/// Vector reinterpret cast operation +/// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_u64(a: uint64x1_t) -> int64x1_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] +pub unsafe fn vqrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i8")] + fn vqrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } +vqrshl_s8_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating rounding shift left #[inline] 
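// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.] vqrshl_* takes
// a per-lane *signed* shift count in `b`: positive counts shift left with
// saturation, negative counts perform a rounding shift right. Per-lane model
// for i8, assuming |shift| < 8 (the ISA also defines out-of-range counts):
//
//     fn sqrshl_i8(a: i8, shift: i8) -> i8 {
//         let v = a as i32;
//         let r = if shift >= 0 {
//             v << shift // may exceed the i8 range; clamped below
//         } else {
//             let s = (-shift) as u32;
//             (v + (1 << (s - 1))) >> s // rounding arithmetic shift right
//         };
//         r.clamp(i8::MIN as i32, i8::MAX as i32) as i8
//     }
//
// The u8/u16/u32/u64 variants saturate to the unsigned range instead; the
// shift-count vector stays signed in every case.
// ---------------------------------------------------------------------------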
#[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_u8(a: uint8x16_t) -> int8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] +pub unsafe fn vqrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v16i8")] + fn vqrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } +vqrshlq_s8_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_p8(a: poly8x16_t) -> int8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] +pub unsafe fn vqrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i16")] + fn vqrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } +vqrshl_s16_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_p16(a: poly16x8_t) -> int16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] +pub unsafe fn vqrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i16")] + fn vqrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } +vqrshlq_s16_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_u16(a: uint16x8_t) -> int16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] +pub unsafe fn vqrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i32")] + fn vqrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } +vqrshl_s32_(a, 
b) } -/// Vector reinterpret cast operation +/// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_u32(a: uint32x4_t) -> int32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] +pub unsafe fn vqrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i32")] + fn vqrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } +vqrshlq_s32_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_u64(a: uint64x2_t) -> int64x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] +pub unsafe fn vqrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v1i64")] + fn vqrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } +vqrshl_s64_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_p8(a: poly8x8_t) -> uint8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] +pub unsafe fn vqrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i64")] + fn vqrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } +vqrshlq_s64_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_s8(a: int8x8_t) -> uint8x8_t { - transmute(a) -} - -/// Vector reinterpret cast operation -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_p16(a: poly16x4_t) -> uint16x4_t { - transmute(a) +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i8")] + fn vqrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } +vqrshl_u8_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_s16(a: int16x4_t) -> uint16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v16i8")] + fn vqrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } +vqrshlq_u8_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_s32(a: int32x2_t) -> uint32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i16")] + fn vqrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } +vqrshl_u16_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_s64(a: int64x1_t) -> uint64x1_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i16")] + fn vqrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } +vqrshlq_u16_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch 
= "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_p8(a: poly8x16_t) -> uint8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i32")] + fn vqrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } +vqrshl_u32_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_s8(a: int8x16_t) -> uint8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i32")] + fn vqrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } +vqrshlq_u32_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_p16(a: poly16x8_t) -> uint16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v1i64")] + fn vqrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } +vqrshl_u64_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_s16(a: int16x8_t) -> uint16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i64")] + fn vqrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } +vqrshlq_u64_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] 
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v8i8")] + fn vqrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t; + } +vqrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_s64(a: int64x2_t) -> uint64x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v8i8")] + fn vqrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t; + } +vqrshrn_n_s16_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_s8(a: int8x8_t) -> poly8x8_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v4i16")] + fn vqrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t; + } +vqrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_u8(a: uint8x8_t) -> poly8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v4i16")] + fn 
vqrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t; + } +vqrshrn_n_s32_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v2i32")] + fn vqrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t; + } +vqrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64)) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_u16(a: uint16x4_t) -> poly16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v2i32")] + fn vqrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t; + } +vqrshrn_n_s64_(a, N) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_s8(a: int8x16_t) -> poly8x16_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v8i8")] + fn vqrshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t; + } +vqrshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16)) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_u8(a: uint8x16_t) -> poly8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u16(a: 
uint16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v8i8")] + fn vqrshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t; + } +vqrshrn_n_u16_(a, N) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_s16(a: int16x8_t) -> poly16x8_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v4i16")] + fn vqrshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t; + } +vqrshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32)) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_u16(a: uint16x8_t) -> poly16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v4i16")] + fn vqrshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t; + } +vqrshrn_n_u32_(a, N) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_s16(a: int16x4_t) -> int8x8_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v2i32")] + fn vqrshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t; + } +vqrshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64)) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn 
vreinterpret_s8_u16(a: uint16x4_t) -> int8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v2i32")] + fn vqrshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t; + } +vqrshrn_n_u64_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right unsigned narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_p16(a: poly16x4_t) -> int8x8_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrun_n_s16(a: int16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v8i8")] + fn vqrshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t; + } +vqrshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right unsigned narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_s32(a: int32x2_t) -> int16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrun_n_s16(a: int16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v8i8")] + fn vqrshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t; + } +vqrshrun_n_s16_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right unsigned narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_u32(a: uint32x2_t) -> int16x4_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrun_n_s32(a: int32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v4i16")] + fn vqrshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t; + } +vqrshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right unsigned narrow #[inline] 
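// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.] vqrshrun_n_*
// differs from vqrshrn_n_* only in the saturation target: the rounded,
// right-shifted signed value is clamped into the *unsigned* narrow range, so
// negative results become 0. Per-lane model, s16 -> u8, 1 <= N <= 8:
//
//     fn sqrshrun_n_i16<const N: i32>(a: i16) -> u8 {
//         (((a as i32) + (1 << (N - 1))) >> N).clamp(0, u8::MAX as i32) as u8
//     }
// ---------------------------------------------------------------------------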
+#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_s64(a: int64x1_t) -> int32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrun_n_s32(a: int32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v4i16")] + fn vqrshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t; + } +vqrshrun_n_s32_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right unsigned narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v2i32")] + fn vqrshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t; + } +vqrshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64)) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right unsigned narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_s16(a: int16x8_t) -> int8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v2i32")] + fn vqrshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t; + } +vqrshrun_n_s64_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_u16(a: uint16x8_t) -> int8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i8")] + fn vqshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } +vqshl_s8_(a, b) 
} -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_p16(a: poly16x8_t) -> int8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v16i8")] + fn vqshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } +vqshlq_s8_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_s32(a: int32x4_t) -> int16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i16")] + fn vqshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } +vqshl_s16_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_u32(a: uint32x4_t) -> int16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i16")] + fn vqshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } +vqshlq_s16_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_s64(a: int64x2_t) -> int32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i32")] + fn vqshl_s32_(a: int32x2_t, b: int32x2_t) -> 
int32x2_t; + } +vqshl_s32_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_u64(a: uint64x2_t) -> int32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i32")] + fn vqshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } +vqshlq_s32_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_p16(a: poly16x4_t) -> uint8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v1i64")] + fn vqshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } +vqshl_s64_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_s16(a: int16x4_t) -> uint8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i64")] + fn vqshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } +vqshlq_s64_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_u16(a: uint16x4_t) -> uint8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i8")] + fn 
vqshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } +vqshl_u8_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_s32(a: int32x2_t) -> uint16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v16i8")] + fn vqshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } +vqshlq_u8_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_u32(a: uint32x2_t) -> uint16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i16")] + fn vqshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } +vqshl_u16_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_s64(a: int64x1_t) -> uint32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i16")] + fn vqshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } +vqshlq_u16_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_u64(a: uint64x1_t) -> uint32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i32")] + #[cfg_attr(target_arch = 
"aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i32")] + fn vqshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } +vqshl_u32_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_p16(a: poly16x8_t) -> uint8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i32")] + fn vqshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } +vqshlq_u32_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v1i64")] + fn vqshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } +vqshl_u64_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_u16(a: uint16x8_t) -> uint8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i64")] + fn vqshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } +vqshlq_u64_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_s32(a: int32x4_t) -> uint16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshl_n_s8(a: int8x8_t) -> int8x8_t { + static_assert_imm3!(N); + vqshl_s8(a, 
vdup_n_s8(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_u32(a: uint32x4_t) -> uint16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshlq_n_s8(a: int8x16_t) -> int8x16_t { + static_assert_imm3!(N); + vqshlq_s8(a, vdupq_n_s8(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_s64(a: int64x2_t) -> uint32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshl_n_s16(a: int16x4_t) -> int16x4_t { + static_assert_imm4!(N); + vqshl_s16(a, vdup_n_s16(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_u64(a: uint64x2_t) -> uint32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshlq_n_s16(a: int16x8_t) -> int16x8_t { + static_assert_imm4!(N); + vqshlq_s16(a, vdupq_n_s16(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_p16(a: poly16x4_t) -> poly8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshl_n_s32(a: int32x2_t) -> int32x2_t { + static_assert_imm5!(N); + vqshl_s32(a, vdup_n_s32(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_s16(a: int16x4_t) -> poly8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshlq_n_s32(a: int32x4_t) -> int32x4_t { + 
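// The shift amount N is a const generic checked at compile time:
// static_assert_imm5! below rejects any N outside 0..=31 (a 5-bit immediate,
// matching the 32-bit element width), and the saturating shift itself is
// performed by splatting N into a vector and reusing the register form above.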
static_assert_imm5!(N); + vqshlq_s32(a, vdupq_n_s32(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_u16(a: uint16x4_t) -> poly8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshl_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t { + static_assert_imm6!(N); + vqshl_s64(a, vdup_n_s64(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_s32(a: int32x2_t) -> poly16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshlq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t { + static_assert_imm6!(N); + vqshlq_s64(a, vdupq_n_s64(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_u32(a: uint32x2_t) -> poly16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t { + static_assert_imm3!(N); + vqshl_u8(a, vdup_n_s8(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_p16(a: poly16x8_t) -> poly8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshlq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t { + static_assert_imm3!(N); + vqshlq_u8(a, vdupq_n_s8(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_s16(a: int16x8_t) -> poly8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshl_n_u16<const N: i32>(a: 
uint16x4_t) -> uint16x4_t { + static_assert_imm4!(N); + vqshl_u16(a, vdup_n_s16(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_u16(a: uint16x8_t) -> poly8x16_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshlq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t { + static_assert_imm4!(N); + vqshlq_u16(a, vdupq_n_s16(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_s32(a: int32x4_t) -> poly16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t { + static_assert_imm5!(N); + vqshl_u32(a, vdup_n_s32(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_u32(a: uint32x4_t) -> poly16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t { + static_assert_imm5!(N); + vqshlq_u32(a, vdupq_n_s32(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_p8(a: poly8x8_t) -> int16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshl_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t { + static_assert_imm6!(N); + vqshl_u64(a, vdup_n_s64(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_s8(a: int8x8_t) -> int16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] 
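+// A rough usage sketch for these `_n_` immediate forms (illustrative lane
+// values only; any values behave the same way):
+//     let x = vdup_n_s8(2);          // every lane = 2
+//     let y = vqshl_n_s8::<3>(x);    // 2 << 3 = 16 in every lane
+//     let z = vqshl_n_s8::<7>(x);    // 2 << 7 = 256, saturates to i8::MAX (127)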
+#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshlq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t { + static_assert_imm6!(N); + vqshlq_u64(a, vdupq_n_s64(N.try_into().unwrap())) } -/// Vector reinterpret cast operation +/// Signed saturating shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_u8(a: uint8x8_t) -> int16x4_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v8i8")] + fn vqshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t; + } +vqshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) } -/// Vector reinterpret cast operation +/// Signed saturating shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_p16(a: poly16x4_t) -> int32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v8i8")] + fn vqshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t; + } +vqshrn_n_s16_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_s16(a: int16x4_t) -> int32x2_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v4i16")] + fn vqshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t; + } +vqshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) } -/// Vector reinterpret cast operation +/// Signed saturating shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))] 
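+// Note the per-architecture lowering visible here: the arm path has no
+// scalar-count intrinsic, so it splats -N into a vector and calls the shift
+// intrinsic (a negative count shifts right), while the aarch64 path passes N
+// straight to llvm.aarch64.neon.sqshrn. Illustrative example:
+//     vqshrn_n_s16::<2>(vdupq_n_s16(40)) narrows 40 >> 2 = 10 into each i8 lane.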
+#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v4i16")] + fn vqshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t; + } +vqshrn_n_s32_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_s32(a: int32x2_t) -> int64x1_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v2i32")] + fn vqshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t; + } +vqshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64)) } -/// Vector reinterpret cast operation +/// Signed saturating shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_u32(a: uint32x2_t) -> int64x1_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v2i32")] + fn vqshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t; + } +vqshrn_n_s64_(a, N) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_p8(a: poly8x16_t) -> int16x8_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v8i8")] + fn vqshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t; + } +vqshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16)) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub 
unsafe fn vreinterpretq_s16_s8(a: int8x16_t) -> int16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v8i8")] + fn vqshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t; + } +vqshrn_n_u16_(a, N) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_u8(a: uint8x16_t) -> int16x8_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v4i16")] + fn vqshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t; + } +vqshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32)) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_p16(a: poly16x8_t) -> int32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v4i16")] + fn vqshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t; + } +vqshrn_n_u32_(a, N) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_s16(a: int16x8_t) -> int32x4_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v2i32")] + fn vqshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t; + } +vqshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64)) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_u16(a: uint16x8_t) -> int32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v2i32")] + fn vqshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t; + } +vqshrn_n_u64_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating shift right unsigned narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_s32(a: int32x4_t) -> int64x2_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v8i8")] + fn vqshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t; + } +vqshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) } -/// Vector reinterpret cast operation +/// Signed saturating shift right unsigned narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_u32(a: uint32x4_t) -> int64x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v8i8")] + fn vqshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t; + } +vqshrun_n_s16_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating shift right unsigned narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_p8(a: poly8x8_t) -> uint16x4_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v4i16")] + fn vqshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t; + } +vqshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) } -/// Vector 
reinterpret cast operation +/// Signed saturating shift right unsigned narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_s8(a: int8x8_t) -> uint16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v4i16")] + fn vqshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t; + } +vqshrun_n_s32_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating shift right unsigned narrow +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v2i32")] + fn vqshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t; + } +vqshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64)) +} + +/// Signed saturating shift right unsigned narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_u8(a: uint8x8_t) -> uint16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v2i32")] + fn vqshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t; + } +vqshrun_n_s64_(a, N) } -/// Vector reinterpret cast operation +/// Reciprocal square-root estimate. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_p16(a: poly16x4_t) -> uint32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))] +pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")] + fn vrsqrte_f32_(a: float32x2_t) -> float32x2_t; + } +vrsqrte_f32_(a) } -/// Vector reinterpret cast operation +/// Reciprocal square-root estimate. 
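+// vrsqrte_f32 returns only a low-precision approximation of 1/sqrt(x); e.g.
+// vrsqrte_f32(vdup_n_f32(4.0)) gives lanes of approximately 0.5. Where more
+// precision is needed, the estimate is usually refined with Newton-Raphson
+// steps (the vrsqrts step intrinsic, which is not part of this hunk).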
#[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))] +pub unsafe fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v4f32")] + fn vrsqrteq_f32_(a: float32x4_t) -> float32x4_t; + } +vrsqrteq_f32_(a) } -/// Vector reinterpret cast operation +/// Reciprocal estimate. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))] +pub unsafe fn vrecpe_f32(a: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v2f32")] + fn vrecpe_f32_(a: float32x2_t) -> float32x2_t; + } +vrecpe_f32_(a) } -/// Vector reinterpret cast operation +/// Reciprocal estimate. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))] +pub unsafe fn vrecpeq_f32(a: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v4f32")] + fn vrecpeq_f32_(a: float32x4_t) -> float32x4_t; + } +vrecpeq_f32_(a) } /// Vector reinterpret cast operation @@ -11918,7 +12188,7 @@ pub unsafe fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t { +pub unsafe fn vreinterpret_s8_u8(a: uint8x8_t) -> int8x8_t { transmute(a) } @@ -11928,7 +12198,7 @@ pub unsafe fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t { +pub unsafe fn vreinterpret_s8_p8(a: poly8x8_t) -> int8x8_t { transmute(a) } @@ -11938,7 +12208,7 @@ pub unsafe fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t { 
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t { +pub unsafe fn vreinterpret_s16_p16(a: poly16x4_t) -> int16x4_t { transmute(a) } @@ -11948,7 +12218,7 @@ pub unsafe fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t { +pub unsafe fn vreinterpret_s16_u16(a: uint16x4_t) -> int16x4_t { transmute(a) } @@ -11958,7 +12228,7 @@ pub unsafe fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t { +pub unsafe fn vreinterpret_s32_u32(a: uint32x2_t) -> int32x2_t { transmute(a) } @@ -11968,7 +12238,7 @@ pub unsafe fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t { +pub unsafe fn vreinterpret_s64_u64(a: uint64x1_t) -> int64x1_t { transmute(a) } @@ -11978,7 +12248,7 @@ pub unsafe fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t { +pub unsafe fn vreinterpretq_s8_u8(a: uint8x16_t) -> int8x16_t { transmute(a) } @@ -11988,7 +12258,7 @@ pub unsafe fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t { +pub unsafe fn vreinterpretq_s8_p8(a: poly8x16_t) -> int8x16_t { transmute(a) } @@ -11998,7 +12268,7 @@ pub unsafe fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t { +pub unsafe fn vreinterpretq_s16_p16(a: poly16x8_t) -> int16x8_t { transmute(a) } @@ -12008,7 +12278,7 @@ pub unsafe fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t { +pub unsafe fn vreinterpretq_s16_u16(a: uint16x8_t) -> int16x8_t { transmute(a) } @@ -12018,7 +12288,7 @@ pub unsafe fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = 
"arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t { +pub unsafe fn vreinterpretq_s32_u32(a: uint32x4_t) -> int32x4_t { transmute(a) } @@ -12028,7 +12298,7 @@ pub unsafe fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t { +pub unsafe fn vreinterpretq_s64_u64(a: uint64x2_t) -> int64x2_t { transmute(a) } @@ -12038,7 +12308,7 @@ pub unsafe fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t { +pub unsafe fn vreinterpret_u8_p8(a: poly8x8_t) -> uint8x8_t { transmute(a) } @@ -12048,7 +12318,7 @@ pub unsafe fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t { +pub unsafe fn vreinterpret_u8_s8(a: int8x8_t) -> uint8x8_t { transmute(a) } @@ -12058,7 +12328,7 @@ pub unsafe fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t { +pub unsafe fn vreinterpret_u16_p16(a: poly16x4_t) -> uint16x4_t { transmute(a) } @@ -12068,7 +12338,7 @@ pub unsafe fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t { +pub unsafe fn vreinterpret_u16_s16(a: int16x4_t) -> uint16x4_t { transmute(a) } @@ -12078,7 +12348,7 @@ pub unsafe fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t { +pub unsafe fn vreinterpret_u32_s32(a: int32x2_t) -> uint32x2_t { transmute(a) } @@ -12088,7 +12358,7 @@ pub unsafe fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t { +pub unsafe fn vreinterpret_u64_s64(a: int64x1_t) -> uint64x1_t { transmute(a) } @@ -12098,7 +12368,7 @@ pub unsafe fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn 
vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t { +pub unsafe fn vreinterpretq_u8_p8(a: poly8x16_t) -> uint8x16_t { transmute(a) } @@ -12108,7 +12378,7 @@ pub unsafe fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t { +pub unsafe fn vreinterpretq_u8_s8(a: int8x16_t) -> uint8x16_t { transmute(a) } @@ -12118,7 +12388,7 @@ pub unsafe fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t { +pub unsafe fn vreinterpretq_u16_p16(a: poly16x8_t) -> uint16x8_t { transmute(a) } @@ -12128,7 +12398,7 @@ pub unsafe fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t { +pub unsafe fn vreinterpretq_u16_s16(a: int16x8_t) -> uint16x8_t { transmute(a) } @@ -12138,7 +12408,7 @@ pub unsafe fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t { +pub unsafe fn vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t { transmute(a) } @@ -12148,7 +12418,7 @@ pub unsafe fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t { +pub unsafe fn vreinterpretq_u64_s64(a: int64x2_t) -> uint64x2_t { transmute(a) } @@ -12158,7 +12428,7 @@ pub unsafe fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t { +pub unsafe fn vreinterpret_p8_s8(a: int8x8_t) -> poly8x8_t { transmute(a) } @@ -12168,7 +12438,7 @@ pub unsafe fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t { +pub unsafe fn vreinterpret_p8_u8(a: uint8x8_t) -> poly8x8_t { transmute(a) } @@ -12178,7 +12448,7 @@ pub unsafe fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t { +pub unsafe fn vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t { 
transmute(a) } @@ -12188,7 +12458,7 @@ pub unsafe fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t { +pub unsafe fn vreinterpret_p16_u16(a: uint16x4_t) -> poly16x4_t { transmute(a) } @@ -12198,7 +12468,7 @@ pub unsafe fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t { +pub unsafe fn vreinterpretq_p8_s8(a: int8x16_t) -> poly8x16_t { transmute(a) } @@ -12208,7 +12478,7 @@ pub unsafe fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t { +pub unsafe fn vreinterpretq_p8_u8(a: uint8x16_t) -> poly8x16_t { transmute(a) } @@ -12218,7 +12488,7 @@ pub unsafe fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t { +pub unsafe fn vreinterpretq_p16_s16(a: int16x8_t) -> poly16x8_t { transmute(a) } @@ -12228,7 +12498,7 @@ pub unsafe fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t { +pub unsafe fn vreinterpretq_p16_u16(a: uint16x8_t) -> poly16x8_t { transmute(a) } @@ -12238,7 +12508,7 @@ pub unsafe fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t { +pub unsafe fn vreinterpret_s8_s16(a: int16x4_t) -> int8x8_t { transmute(a) } @@ -12248,7 +12518,7 @@ pub unsafe fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t { +pub unsafe fn vreinterpret_s8_u16(a: uint16x4_t) -> int8x8_t { transmute(a) } @@ -12258,7 +12528,7 @@ pub unsafe fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t { +pub unsafe fn vreinterpret_s8_p16(a: poly16x4_t) -> int8x8_t { transmute(a) } @@ -12268,7 +12538,7 @@ pub unsafe fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t { 
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t { +pub unsafe fn vreinterpret_s16_s32(a: int32x2_t) -> int16x4_t { transmute(a) } @@ -12278,7 +12548,7 @@ pub unsafe fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t { +pub unsafe fn vreinterpret_s16_u32(a: uint32x2_t) -> int16x4_t { transmute(a) } @@ -12288,7 +12558,7 @@ pub unsafe fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t { +pub unsafe fn vreinterpret_s32_s64(a: int64x1_t) -> int32x2_t { transmute(a) } @@ -12298,7 +12568,7 @@ pub unsafe fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t { +pub unsafe fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t { transmute(a) } @@ -12308,7 +12578,7 @@ pub unsafe fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t { +pub unsafe fn vreinterpretq_s8_s16(a: int16x8_t) -> int8x16_t { transmute(a) } @@ -12318,7 +12588,7 @@ pub unsafe fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t { +pub unsafe fn vreinterpretq_s8_u16(a: uint16x8_t) -> int8x16_t { transmute(a) } @@ -12328,7 +12598,7 @@ pub unsafe fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t { +pub unsafe fn vreinterpretq_s8_p16(a: poly16x8_t) -> int8x16_t { transmute(a) } @@ -12338,7 +12608,7 @@ pub unsafe fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t { +pub unsafe fn vreinterpretq_s16_s32(a: int32x4_t) -> int16x8_t { transmute(a) } @@ -12348,7 +12618,7 @@ pub unsafe fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t { +pub unsafe fn vreinterpretq_s16_u32(a: uint32x4_t) -> int16x8_t { transmute(a) } @@ -12358,7 +12628,7 @@ pub unsafe fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t { +pub unsafe fn vreinterpretq_s32_s64(a: int64x2_t) -> int32x4_t { transmute(a) } @@ -12368,7 +12638,7 @@ pub unsafe fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t { +pub unsafe fn vreinterpretq_s32_u64(a: uint64x2_t) -> int32x4_t { transmute(a) } @@ -12378,7 +12648,7 @@ pub unsafe fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t { +pub unsafe fn vreinterpret_u8_p16(a: poly16x4_t) -> uint8x8_t { transmute(a) } @@ -12388,7 +12658,7 @@ pub unsafe fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t { +pub unsafe fn vreinterpret_u8_s16(a: int16x4_t) -> uint8x8_t { transmute(a) } @@ -12398,7 +12668,7 @@ pub unsafe fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t { +pub unsafe fn vreinterpret_u8_u16(a: uint16x4_t) -> uint8x8_t { transmute(a) } @@ -12408,7 +12678,7 @@ pub unsafe fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t { +pub unsafe fn vreinterpret_u16_s32(a: int32x2_t) -> uint16x4_t { transmute(a) } @@ -12418,7 +12688,7 @@ pub unsafe fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t { +pub unsafe fn vreinterpret_u16_u32(a: uint32x2_t) -> uint16x4_t { transmute(a) } @@ -12428,7 +12698,7 @@ pub unsafe fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn 
vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t { +pub unsafe fn vreinterpret_u32_s64(a: int64x1_t) -> uint32x2_t { transmute(a) } @@ -12438,7 +12708,7 @@ pub unsafe fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t { +pub unsafe fn vreinterpret_u32_u64(a: uint64x1_t) -> uint32x2_t { transmute(a) } @@ -12448,7 +12718,7 @@ pub unsafe fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t { +pub unsafe fn vreinterpretq_u8_p16(a: poly16x8_t) -> uint8x16_t { transmute(a) } @@ -12458,7 +12728,7 @@ pub unsafe fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t { transmute(a) } @@ -12468,7 +12738,7 @@ pub unsafe fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_u8_u16(a: uint16x8_t) -> uint8x16_t { transmute(a) } @@ -12478,7 +12748,7 @@ pub unsafe fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_u16_s32(a: int32x4_t) -> uint16x8_t { transmute(a) } @@ -12488,7 +12758,7 @@ pub unsafe fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t { +pub unsafe fn vreinterpretq_u16_u32(a: uint32x4_t) -> uint16x8_t { transmute(a) } @@ -12498,7 +12768,7 @@ pub unsafe fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t { +pub unsafe fn vreinterpretq_u32_s64(a: int64x2_t) -> uint32x4_t { transmute(a) } @@ -12508,7 +12778,7 @@ pub unsafe fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t { +pub unsafe fn vreinterpretq_u32_u64(a: uint64x2_t) -> 
uint32x4_t { transmute(a) } @@ -12518,7 +12788,7 @@ pub unsafe fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t { +pub unsafe fn vreinterpret_p8_p16(a: poly16x4_t) -> poly8x8_t { transmute(a) } @@ -12528,7 +12798,7 @@ pub unsafe fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t { +pub unsafe fn vreinterpret_p8_s16(a: int16x4_t) -> poly8x8_t { transmute(a) } @@ -12538,7 +12808,7 @@ pub unsafe fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t { +pub unsafe fn vreinterpret_p8_u16(a: uint16x4_t) -> poly8x8_t { transmute(a) } @@ -12548,7 +12818,7 @@ pub unsafe fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t { +pub unsafe fn vreinterpret_p16_s32(a: int32x2_t) -> poly16x4_t { transmute(a) } @@ -12558,7 +12828,7 @@ pub unsafe fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t { +pub unsafe fn vreinterpret_p16_u32(a: uint32x2_t) -> poly16x4_t { transmute(a) } @@ -12568,7 +12838,7 @@ pub unsafe fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t { +pub unsafe fn vreinterpretq_p8_p16(a: poly16x8_t) -> poly8x16_t { transmute(a) } @@ -12578,7 +12848,7 @@ pub unsafe fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t { +pub unsafe fn vreinterpretq_p8_s16(a: int16x8_t) -> poly8x16_t { transmute(a) } @@ -12588,7 +12858,7 @@ pub unsafe fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t { +pub unsafe fn vreinterpretq_p8_u16(a: uint16x8_t) -> poly8x16_t { transmute(a) } @@ -12598,7 +12868,7 @@ pub unsafe fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t { 
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t { +pub unsafe fn vreinterpretq_p16_s32(a: int32x4_t) -> poly16x8_t { transmute(a) } @@ -12608,7 +12878,7 @@ pub unsafe fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t { +pub unsafe fn vreinterpretq_p16_u32(a: uint32x4_t) -> poly16x8_t { transmute(a) } @@ -12618,7 +12888,7 @@ pub unsafe fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t { +pub unsafe fn vreinterpret_s16_p8(a: poly8x8_t) -> int16x4_t { transmute(a) } @@ -12628,7 +12898,7 @@ pub unsafe fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t { +pub unsafe fn vreinterpret_s16_s8(a: int8x8_t) -> int16x4_t { transmute(a) } @@ -12638,7 +12908,7 @@ pub unsafe fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t { +pub unsafe fn vreinterpret_s16_u8(a: uint8x8_t) -> int16x4_t { transmute(a) } @@ -12648,7 +12918,7 @@ pub unsafe fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t { +pub unsafe fn vreinterpret_s32_p16(a: poly16x4_t) -> int32x2_t { transmute(a) } @@ -12658,7 +12928,7 @@ pub unsafe fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t { +pub unsafe fn vreinterpret_s32_s16(a: int16x4_t) -> int32x2_t { transmute(a) } @@ -12668,7 +12938,7 @@ pub unsafe fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t { +pub unsafe fn vreinterpret_s32_u16(a: uint16x4_t) -> int32x2_t { transmute(a) } @@ -12678,7 +12948,7 @@ pub unsafe fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t { +pub unsafe fn vreinterpret_s64_s32(a: int32x2_t) -> int64x1_t { transmute(a) } @@ -12688,7 +12958,7 @@ pub unsafe fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t { +pub unsafe fn vreinterpret_s64_u32(a: uint32x2_t) -> int64x1_t { transmute(a) } @@ -12698,7 +12968,7 @@ pub unsafe fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_s16_p8(a: poly8x16_t) -> int16x8_t { transmute(a) } @@ -12708,7 +12978,7 @@ pub unsafe fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_s16_s8(a: int8x16_t) -> int16x8_t { transmute(a) } @@ -12718,7 +12988,7 @@ pub unsafe fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_s16_u8(a: uint8x16_t) -> int16x8_t { transmute(a) } @@ -12728,7 +12998,7 @@ pub unsafe fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t { +pub unsafe fn vreinterpretq_s32_p16(a: poly16x8_t) -> int32x4_t { transmute(a) } @@ -12738,7 +13008,7 @@ pub unsafe fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t { +pub unsafe fn vreinterpretq_s32_s16(a: int16x8_t) -> int32x4_t { transmute(a) } @@ -12748,7 +13018,7 @@ pub unsafe fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t { +pub unsafe fn vreinterpretq_s32_u16(a: uint16x8_t) -> int32x4_t { transmute(a) } @@ -12758,7 +13028,7 @@ pub unsafe fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_p8(a: 
poly8x16_t) -> uint64x2_t { +pub unsafe fn vreinterpretq_s64_s32(a: int32x4_t) -> int64x2_t { transmute(a) } @@ -12768,7 +13038,7 @@ pub unsafe fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t { +pub unsafe fn vreinterpretq_s64_u32(a: uint32x4_t) -> int64x2_t { transmute(a) } @@ -12778,7 +13048,7 @@ pub unsafe fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t { +pub unsafe fn vreinterpret_u16_p8(a: poly8x8_t) -> uint16x4_t { transmute(a) } @@ -12788,7 +13058,7 @@ pub unsafe fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t { +pub unsafe fn vreinterpret_u16_s8(a: int8x8_t) -> uint16x4_t { transmute(a) } @@ -12798,7 +13068,7 @@ pub unsafe fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t { +pub unsafe fn vreinterpret_u16_u8(a: uint8x8_t) -> uint16x4_t { transmute(a) } @@ -12808,7 +13078,7 @@ pub unsafe fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t { +pub unsafe fn vreinterpret_u32_p16(a: poly16x4_t) -> uint32x2_t { transmute(a) } @@ -12818,7 +13088,7 @@ pub unsafe fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t { +pub unsafe fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t { transmute(a) } @@ -12828,7 +13098,7 @@ pub unsafe fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t { +pub unsafe fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t { transmute(a) } @@ -12838,7 +13108,7 @@ pub unsafe fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t { +pub unsafe fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t { 
transmute(a) } @@ -12848,7 +13118,7 @@ pub unsafe fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t { +pub unsafe fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t { transmute(a) } @@ -12858,7 +13128,7 @@ pub unsafe fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t { +pub unsafe fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t { transmute(a) } @@ -12868,7 +13138,7 @@ pub unsafe fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t { +pub unsafe fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t { transmute(a) } @@ -12878,7 +13148,7 @@ pub unsafe fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t { +pub unsafe fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t { transmute(a) } @@ -12888,7 +13158,7 @@ pub unsafe fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t { +pub unsafe fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t { transmute(a) } @@ -12898,7 +13168,7 @@ pub unsafe fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t { transmute(a) } @@ -12908,7 +13178,7 @@ pub unsafe fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t { +pub unsafe fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t { transmute(a) } @@ -12918,7 +13188,7 @@ pub unsafe fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t { +pub unsafe fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t { transmute(a) } @@ -12928,7 +13198,7 @@ pub unsafe fn 
vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t { +pub unsafe fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t { transmute(a) } @@ -12938,7 +13208,7 @@ pub unsafe fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t { +pub unsafe fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t { transmute(a) } @@ -12948,7 +13218,7 @@ pub unsafe fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t { +pub unsafe fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t { transmute(a) } @@ -12958,7 +13228,7 @@ pub unsafe fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t { +pub unsafe fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t { transmute(a) } @@ -12968,7 +13238,7 @@ pub unsafe fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t { +pub unsafe fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t { transmute(a) } @@ -12978,7 +13248,7 @@ pub unsafe fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t { +pub unsafe fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t { transmute(a) } @@ -12988,7 +13258,7 @@ pub unsafe fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t { +pub unsafe fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t { transmute(a) } @@ -12998,7 +13268,7 @@ pub unsafe fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t { +pub unsafe fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t { transmute(a) } @@ -13008,7 +13278,7 @@ pub unsafe fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t { +pub unsafe fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t { transmute(a) } @@ -13018,7 +13288,7 @@ pub unsafe fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t { +pub unsafe fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t { transmute(a) } @@ -13028,7 +13298,7 @@ pub unsafe fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t { +pub unsafe fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t { transmute(a) } @@ -13038,7 +13308,7 @@ pub unsafe fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t { +pub unsafe fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t { transmute(a) } @@ -13048,7 +13318,7 @@ pub unsafe fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t { +pub unsafe fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t { transmute(a) } @@ -13058,7 +13328,7 @@ pub unsafe fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t { +pub unsafe fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t { transmute(a) } @@ -13068,7 +13338,7 @@ pub unsafe fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t { +pub unsafe fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t { transmute(a) } @@ -13078,7 +13348,7 @@ pub unsafe fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t { +pub unsafe fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t { transmute(a) } @@ -13088,7 +13358,7 @@ pub unsafe fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] 
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t { +pub unsafe fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t { transmute(a) } @@ -13098,7 +13368,7 @@ pub unsafe fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t { +pub unsafe fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t { transmute(a) } @@ -13108,7 +13378,7 @@ pub unsafe fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t { +pub unsafe fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t { transmute(a) } @@ -13118,7 +13388,7 @@ pub unsafe fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t { +pub unsafe fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t { transmute(a) } @@ -13128,7 +13398,7 @@ pub unsafe fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t { +pub unsafe fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t { transmute(a) } @@ -13138,7 +13408,7 @@ pub unsafe fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t { +pub unsafe fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t { transmute(a) } @@ -13148,7 +13418,7 @@ pub unsafe fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t { +pub unsafe fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t { transmute(a) } @@ -13158,7 +13428,7 @@ pub unsafe fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t { +pub unsafe fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t { transmute(a) } @@ -13168,7 +13438,7 @@ pub unsafe fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe 
fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t { +pub unsafe fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t { transmute(a) } @@ -13178,7154 +13448,8678 @@ pub unsafe fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_p16(a: poly16x8_t) -> float32x4_t { +pub unsafe fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t { transmute(a) } -/// Signed rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))] -pub unsafe fn vrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i8")] - fn vrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; - } -vrshl_s8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t { + transmute(a) } -/// Signed rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))] -pub unsafe fn vrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v16i8")] - fn vrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; - } -vrshlq_s8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t { + transmute(a) } -/// Signed rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))] -pub unsafe fn vrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i16")] - fn vrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; - } -vrshl_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t { + transmute(a) } -/// Signed rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), 
assert_instr(srshl))] -pub unsafe fn vrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i16")] - fn vrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; - } -vrshlq_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t { + transmute(a) } -/// Signed rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))] -pub unsafe fn vrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i32")] - fn vrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; - } -vrshl_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t { + transmute(a) } -/// Signed rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))] -pub unsafe fn vrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i32")] - fn vrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; - } -vrshlq_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t { + transmute(a) } -/// Signed rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))] -pub unsafe fn vrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v1i64")] - fn vrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; - } -vrshl_s64_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t { + transmute(a) } -/// Signed rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, 
target_arch = "aarch64"), assert_instr(srshl))] -pub unsafe fn vrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i64")] - fn vrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; - } -vrshlq_s64_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t { + transmute(a) } -/// Unsigned rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] -pub unsafe fn vrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i8")] - fn vrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; - } -vrshl_u8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t { + transmute(a) } -/// Unsigned rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] -pub unsafe fn vrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v16i8")] - fn vrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; - } -vrshlq_u8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t { + transmute(a) } -/// Unsigned rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] -pub unsafe fn vrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i16")] - fn vrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; - } -vrshl_u16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t { + transmute(a) } -/// Unsigned rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] -pub unsafe fn vrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i16")] - fn vrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; - } -vrshlq_u16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t { + transmute(a) } -/// Unsigned rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] -pub unsafe fn vrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i32")] - fn vrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; - } -vrshl_u32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t { + transmute(a) } -/// Unsigned rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] -pub unsafe fn vrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i32")] - fn vrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; - } -vrshlq_u32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t { + transmute(a) } -/// Unsigned rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] -pub unsafe fn vrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v1i64")] - fn vrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; - } -vrshl_u64_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t { + transmute(a) } -/// Unsigned rounding shift left +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = 
"v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] -pub unsafe fn vrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i64")] - fn vrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; - } -vrshlq_u64_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t { + transmute(a) } -/// Signed rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshr_n_s8(a: int8x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - vrshl_s8(a, vdup_n_s8((-N).try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t { + transmute(a) } -/// Signed rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshrq_n_s8(a: int8x16_t) -> int8x16_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - vrshlq_s8(a, vdupq_n_s8((-N).try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t { + transmute(a) } -/// Signed rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshr_n_s16(a: int16x4_t) -> int16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - vrshl_s16(a, vdup_n_s16((-N).try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t { + transmute(a) } -/// Signed rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshrq_n_s16(a: int16x8_t) -> int16x8_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - vrshlq_s16(a, vdupq_n_s16((-N).try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] 
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t { + transmute(a) } -/// Signed rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshr_n_s32(a: int32x2_t) -> int32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - vrshl_s32(a, vdup_n_s32((-N).try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t { + transmute(a) } -/// Signed rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshrq_n_s32(a: int32x4_t) -> int32x4_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - vrshlq_s32(a, vdupq_n_s32((-N).try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t { + transmute(a) } -/// Signed rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshr_n_s64(a: int64x1_t) -> int64x1_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - vrshl_s64(a, vdup_n_s64((-N).try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t { + transmute(a) } -/// Signed rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshrq_n_s64(a: int64x2_t) -> int64x2_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - vrshlq_s64(a, vdupq_n_s64((-N).try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t { + transmute(a) } -/// Unsigned rounding shift right +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshr_n_u8(a: uint8x8_t) -> uint8x8_t { - 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    vrshl_u8(a, vdup_n_s8((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    vrshlq_u8(a, vdupq_n_s8((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    vrshl_u16(a, vdup_n_s16((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    vrshlq_u16(a, vdupq_n_s16((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    vrshl_u32(a, vdup_n_s32((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t {
+    transmute(a)
 }
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    vrshlq_u32(a, vdupq_n_s32((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    vrshl_u64(a, vdup_n_s64((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    vrshlq_u64(a, vdupq_n_s64((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t {
+    transmute(a)
 }

-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v8i8")]
-        fn vrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
-    }
-vrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t {
+    transmute(a)
 }

-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v8i8")]
-        fn vrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
-    }
-vrshrn_n_s16_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t {
+    transmute(a)
 }
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v4i16")]
-        fn vrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
-    }
-vrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t {
+    transmute(a)
 }

-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v4i16")]
-        fn vrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t;
-    }
-vrshrn_n_s32_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t {
+    transmute(a)
 }

-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v2i32")]
-        fn vrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
-    }
-vrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t {
+    transmute(a)
 }
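The vrshrn_n_* pattern on either side of this point is "rounding shift right, then narrow to half-width lanes"; the ARM path passes a splatted negative count to vrshiftn, while the AArch64 path passes N straight to rshrn. One lane of that computation, modeled in scalar code (hypothetical helper, not part of the patch):

    fn rounding_shift_right_narrow_i32(a: i32, n: i32) -> i16 {
        assert!((1..=16).contains(&n));
        // Rounding right shift in the wide type, then truncate to the
        // half-width result lane.
        (((a as i64) + (1i64 << (n - 1))) >> n) as i16
    }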
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    transmute(vrshrn_n_s16::<N>(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    transmute(vrshrn_n_s32::<N>(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    transmute(vrshrn_n_s64::<N>(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vrshr_n_s8::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vrshrq_n_s8::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vrshr_n_s16::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vrshrq_n_s16::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vrshr_n_s32::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vrshrq_n_s32::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vrshr_n_s64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vrshrq_n_s64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vrshr_n_u8::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vrshrq_n_u8::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vrshr_n_u16::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vrshrq_n_u16::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vrshr_n_u32::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vrshrq_n_u32::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vrshr_n_u64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t {
+    transmute(a)
 }
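The `vrsra_n_*` family above composes the rounding right shift with an accumulate, `a + vrshr_n::<N>(b)`, lane by lane. A test-style sketch of the arithmetic (illustrative only, assuming the intrinsics are available via `core::arch::aarch64`):

// Sketch: each result lane is acc + ((b + (1 << (N - 1))) >> N).
#[cfg(target_arch = "aarch64")]
unsafe fn vrsra_sketch() {
    use core::arch::aarch64::*;
    let acc = vdup_n_u8(1);
    let b = vdup_n_u8(4);
    let r = vrsra_n_u8::<2>(acc, b); // (4 + 2) >> 2 == 1, accumulated onto 1
    let mut out = [0u8; 8];
    vst1_u8(out.as_mut_ptr(), r);
    assert_eq!(out, [2u8; 8]);
}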
 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vrshrq_n_u64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vset_lane_s8<const LANE: i32>(a: i8, b: int8x8_t) -> int8x8_t {
-    static_assert_imm3!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vset_lane_s16<const LANE: i32>(a: i16, b: int16x4_t) -> int16x4_t {
-    static_assert_imm2!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vset_lane_s32<const LANE: i32>(a: i32, b: int32x2_t) -> int32x2_t {
-    static_assert_imm1!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vset_lane_s64<const LANE: i32>(a: i64, b: int64x1_t) -> int64x1_t {
-    static_assert!(LANE : i32 where LANE == 0);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t {
+    transmute(a)
 }
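Each `vreinterpret*` added here is a pure bit-pattern cast (`transmute`) between equally sized vector types; no lanes are converted or reordered. A sketch (illustrative only, assuming `core::arch::aarch64` exports these names):

// Sketch: reinterpreting f32 lanes as u32 exposes their IEEE-754 bit patterns.
#[cfg(target_arch = "aarch64")]
unsafe fn vreinterpret_sketch() {
    use core::arch::aarch64::*;
    let f = vdup_n_f32(1.0);
    let bits = vreinterpret_u32_f32(f);
    assert_eq!(vget_lane_u32::<0>(bits), 0x3f80_0000); // bit pattern of 1.0f32
}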
"arm"), assert_instr(nop, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vset_lane_u8(a: u8, b: uint8x8_t) -> uint8x8_t { - static_assert_imm3!(LANE); - simd_insert(b, LANE as u32, a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t { + transmute(a) } -/// Insert vector element from another vector element +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vset_lane_u16(a: u16, b: uint16x4_t) -> uint16x4_t { - static_assert_imm2!(LANE); - simd_insert(b, LANE as u32, a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t { + transmute(a) } -/// Insert vector element from another vector element +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vset_lane_u32(a: u32, b: uint32x2_t) -> uint32x2_t { - static_assert_imm1!(LANE); - simd_insert(b, LANE as u32, a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t { + transmute(a) } -/// Insert vector element from another vector element +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vset_lane_u64(a: u64, b: uint64x1_t) -> uint64x1_t { - static_assert!(LANE : i32 where LANE == 0); - simd_insert(b, LANE as u32, a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t { + transmute(a) } -/// Insert vector element from another vector element +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vset_lane_p8(a: p8, b: poly8x8_t) -> poly8x8_t { - static_assert_imm3!(LANE); - simd_insert(b, LANE as u32, a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t { + transmute(a) } -/// Insert vector element from another vector element +/// Vector reinterpret cast operation 
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vset_lane_p16<const LANE: i32>(a: p16, b: poly16x4_t) -> poly16x4_t {
-    static_assert_imm2!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
-#[target_feature(enable = "neon,aes")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vset_lane_p64<const LANE: i32>(a: p64, b: poly64x1_t) -> poly64x1_t {
-    static_assert!(LANE : i32 where LANE == 0);
-    simd_insert(b, LANE as u32, a)
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_s8<const LANE: i32>(a: i8, b: int8x16_t) -> int8x16_t {
-    static_assert_imm4!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_s16<const LANE: i32>(a: i16, b: int16x8_t) -> int16x8_t {
-    static_assert_imm3!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_s32<const LANE: i32>(a: i32, b: int32x4_t) -> int32x4_t {
-    static_assert_imm2!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_s64<const LANE: i32>(a: i64, b: int64x2_t) -> int64x2_t {
-    static_assert_imm1!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_u8<const LANE: i32>(a: u8, b: uint8x16_t) -> uint8x16_t {
-    static_assert_imm4!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_u16<const LANE: i32>(a: u16, b: uint16x8_t) -> uint16x8_t {
-    static_assert_imm3!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_u32<const LANE: i32>(a: u32, b: uint32x4_t) -> uint32x4_t {
-    static_assert_imm2!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_u64<const LANE: i32>(a: u64, b: uint64x2_t) -> uint64x2_t {
-    static_assert_imm1!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_p8<const LANE: i32>(a: p8, b: poly8x16_t) -> poly8x16_t {
-    static_assert_imm4!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_p16<const LANE: i32>(a: p16, b: poly16x8_t) -> poly16x8_t {
-    static_assert_imm3!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
-#[target_feature(enable = "neon,aes")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_p64<const LANE: i32>(a: p64, b: poly64x2_t) -> poly64x2_t {
-    static_assert_imm1!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t {
+    transmute(a)
 }
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vset_lane_f32<const LANE: i32>(a: f32, b: float32x2_t) -> float32x2_t {
-    static_assert_imm1!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t {
+    transmute(a)
 }
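The `vset_lane_*`/`vsetq_lane_*` definitions being moved lower to a single `simd_insert`: one lane is replaced, the rest pass through, with the `static_assert_imm*!`/`static_assert!` guards keeping `LANE` in range at compile time. A sketch (illustrative only, assuming `core::arch::aarch64`):

// Sketch: replace lane 2 of a 4-lane vector, leaving the other lanes intact.
#[cfg(target_arch = "aarch64")]
unsafe fn vset_lane_sketch() {
    use core::arch::aarch64::*;
    let v = vdup_n_s16(7);
    let r = vset_lane_s16::<2>(-1, v);
    let mut out = [0i16; 4];
    vst1_s16(out.as_mut_ptr(), r);
    assert_eq!(out, [7, 7, -1, 7]);
}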
 
-/// Insert vector element from another vector element
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsetq_lane_f32<const LANE: i32>(a: f32, b: float32x4_t) -> float32x4_t {
-    static_assert_imm2!(LANE);
-    simd_insert(b, LANE as u32, a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t {
+    transmute(a)
 }
 
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i8")]
-        fn vshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
-    }
-vshl_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v16i8")]
-        fn vshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
-    }
-vshlq_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i16")]
-        fn vshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vshl_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i16")]
-        fn vshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vshlq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i32")]
-        fn vshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vshl_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t {
+    transmute(a)
 }
 
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i32")]
-        fn vshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vshlq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t {
+    transmute(a)
 }
 
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v1i64")]
-        fn vshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
-    }
-vshl_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t {
+    transmute(a)
 }
 
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i64")]
-        fn vshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
-    }
-vshlq_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t {
+    transmute(a)
 }
 
-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i8")]
-        fn vshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
-    }
-vshl_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v16i8")]
-        fn vshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
-    }
-vshlq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t {
+    transmute(a)
 }
 
-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t {
+    transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_p16(a: poly16x8_t) -> float32x4_t {
+    transmute(a)
+}
+
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i16")]
-        fn vshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i8")]
+        fn vrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
     }
-vshl_u16_(a, b)
+vrshl_s8_(a, b)
 }
 
-/// Unsigned Shift left
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i16")]
-        fn vshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v16i8")]
+        fn vrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
     }
-vshlq_u16_(a, b)
+vrshlq_s8_(a, b)
 }
 
-/// Unsigned Shift left
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i32")]
-        fn vshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i16")]
+        fn vrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
     }
-vshl_u32_(a, b)
+vrshl_s16_(a, b)
 }
 
-/// Unsigned Shift left
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i32")]
-        fn vshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i16")]
+        fn vrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
     }
-vshlq_u32_(a, b)
+vrshlq_s16_(a, b)
 }
 
-/// Unsigned Shift left
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
    #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v1i64")]
-        fn vshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i32")]
+        fn vrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
     }
-vshl_u64_(a, b)
+vrshl_s32_(a, b)
 }
 
-/// Unsigned Shift left
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i64")]
-        fn vshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i32")]
+        fn vrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
     }
-vshlq_u64_(a, b)
+vrshlq_s32_(a, b)
 }
 
-/// Shift left
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
-    static_assert_imm3!(N);
-    simd_shl(a, vdup_n_s8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v1i64")]
+        fn vrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+    }
+vrshl_s64_(a, b)
 }
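The removed `vshl_n_*` immediate shifts are plain `simd_shl` against a splatted constant, with `static_assert_imm*!` bounding `N` to the lane width. A sketch (illustrative only, assuming `core::arch::aarch64`):

// Sketch: every lane of the result is 3 << 4.
#[cfg(target_arch = "aarch64")]
unsafe fn vshl_n_sketch() {
    use core::arch::aarch64::*;
    let a = vdup_n_u16(3);
    let r = vshl_n_u16::<4>(a);
    assert_eq!(vget_lane_u16::<0>(r), 48);
}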
 
-/// Shift left
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
-    static_assert_imm3!(N);
-    simd_shl(a, vdupq_n_s8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i64")]
+        fn vrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+    }
+vrshlq_s64_(a, b)
 }
 
-/// Shift left
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
-    static_assert_imm4!(N);
-    simd_shl(a, vdup_n_s16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i8")]
+        fn vrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+    }
+vrshl_u8_(a, b)
 }
 
-/// Shift left
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
-    static_assert_imm4!(N);
-    simd_shl(a, vdupq_n_s16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v16i8")]
+        fn vrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+    }
+vrshlq_u8_(a, b)
 }
 
-/// Shift left
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
-    static_assert_imm5!(N);
-    simd_shl(a, vdup_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i16")]
+        fn vrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+    }
+vrshl_u16_(a, b)
 }
 
-/// Shift left
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
-    static_assert_imm5!(N);
-    simd_shl(a, vdupq_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i16")]
+        fn vrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+    }
+vrshlq_u16_(a, b)
 }
 
-/// Shift left
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
-    static_assert_imm3!(N);
-    simd_shl(a, vdup_n_u8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i32")]
+        fn vrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+    }
+vrshl_u32_(a, b)
 }
 
-/// Shift left
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
-    static_assert_imm3!(N);
-    simd_shl(a, vdupq_n_u8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i32")]
+        fn vrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
+    }
+vrshlq_u32_(a, b)
 }
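`vrshl_*` shifts each lane of `a` by the signed amount in the matching lane of `b`; negative counts select a rounding right shift, which is what the `vrshr_n_*` wrappers below rely on. A sketch (illustrative only, assuming `core::arch::aarch64`):

// Sketch: a shift count of -2 is a rounding right shift, (7 + 2) >> 2 == 2.
#[cfg(target_arch = "aarch64")]
unsafe fn vrshl_sketch() {
    use core::arch::aarch64::*;
    let a = vdup_n_u32(7);
    let r = vrshl_u32(a, vdup_n_s32(-2));
    assert_eq!(vget_lane_u32::<0>(r), 2);
}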
 
-/// Shift left
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
-    static_assert_imm4!(N);
-    simd_shl(a, vdup_n_u16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v1i64")]
+        fn vrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+    }
+vrshl_u64_(a, b)
 }
 
-/// Shift left
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
-    static_assert_imm4!(N);
-    simd_shl(a, vdupq_n_u16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i64")]
+        fn vrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
+    }
+vrshlq_u64_(a, b)
 }
 
-/// Shift left
+/// Signed rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
-    static_assert_imm5!(N);
-    simd_shl(a, vdup_n_u32(N.try_into().unwrap()))
+pub unsafe fn vrshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    vrshl_s8(a, vdup_n_s8((-N).try_into().unwrap()))
 }
 
-/// Shift left
+/// Signed rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
-    static_assert_imm5!(N);
-    simd_shl(a, vdupq_n_u32(N.try_into().unwrap()))
+pub unsafe fn vrshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    vrshlq_s8(a, vdupq_n_s8((-N).try_into().unwrap()))
 }
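The new `vrshr_n_*` definitions implement the immediate form by negating `N` and delegating to the register form (`vrshl` with a splatted `-N`). A sketch of that equivalence (illustrative only, assuming `core::arch::aarch64`):

// Sketch: vrshr_n_s8::<2> rounds, so (5 + 2) >> 2 == 1 in every lane, and it
// matches vrshl_s8 with a constant -2 shift count.
#[cfg(target_arch = "aarch64")]
unsafe fn vrshr_n_sketch() {
    use core::arch::aarch64::*;
    let a = vdup_n_s8(5);
    let r = vrshr_n_s8::<2>(a);
    assert_eq!(vget_lane_s8::<0>(r), 1);
    let via_vrshl = vrshl_s8(a, vdup_n_s8(-2));
    assert_eq!(vget_lane_s8::<0>(via_vrshl), 1);
}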
= "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshl_n_s64(a: int64x1_t) -> int64x1_t { - static_assert_imm6!(N); - simd_shl(a, vdup_n_s64(N.try_into().unwrap())) +pub unsafe fn vrshr_n_s16(a: int16x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + vrshl_s16(a, vdup_n_s16((-N).try_into().unwrap())) } -/// Shift left +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshlq_n_s64(a: int64x2_t) -> int64x2_t { - static_assert_imm6!(N); - simd_shl(a, vdupq_n_s64(N.try_into().unwrap())) +pub unsafe fn vrshrq_n_s16(a: int16x8_t) -> int16x8_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + vrshlq_s16(a, vdupq_n_s16((-N).try_into().unwrap())) } -/// Shift left +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshl_n_u64(a: uint64x1_t) -> uint64x1_t { - static_assert_imm6!(N); - simd_shl(a, vdup_n_u64(N.try_into().unwrap())) +pub unsafe fn vrshr_n_s32(a: int32x2_t) -> int32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + vrshl_s32(a, vdup_n_s32((-N).try_into().unwrap())) } -/// Shift left +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshlq_n_u64(a: uint64x2_t) -> uint64x2_t { - static_assert_imm6!(N); - simd_shl(a, vdupq_n_u64(N.try_into().unwrap())) +pub unsafe fn vrshrq_n_s32(a: int32x4_t) -> int32x4_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + vrshlq_s32(a, vdupq_n_s32((-N).try_into().unwrap())) } -/// Signed shift left long +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s8", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshll_n_s8(a: int8x8_t) -> int16x8_t { - static_assert!(N : i32 where N >= 0 && N <= 8); - simd_shl(simd_cast(a), vdupq_n_s16(N.try_into().unwrap())) +pub unsafe fn vrshr_n_s64(a: int64x1_t) -> int64x1_t { + static_assert!(N : i32 
where N >= 1 && N <= 64); + vrshl_s64(a, vdup_n_s64((-N).try_into().unwrap())) } -/// Signed shift left long +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s16", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshll_n_s16(a: int16x4_t) -> int32x4_t { - static_assert!(N : i32 where N >= 0 && N <= 16); - simd_shl(simd_cast(a), vdupq_n_s32(N.try_into().unwrap())) +pub unsafe fn vrshrq_n_s64(a: int64x2_t) -> int64x2_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + vrshlq_s64(a, vdupq_n_s64((-N).try_into().unwrap())) } -/// Signed shift left long +/// Unsigned rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s32", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshll_n_s32(a: int32x2_t) -> int64x2_t { - static_assert!(N : i32 where N >= 0 && N <= 32); - simd_shl(simd_cast(a), vdupq_n_s64(N.try_into().unwrap())) +pub unsafe fn vrshr_n_u8(a: uint8x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + vrshl_u8(a, vdup_n_s8((-N).try_into().unwrap())) } -/// Signed shift left long +/// Unsigned rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u8", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshll_n_u8(a: uint8x8_t) -> uint16x8_t { - static_assert!(N : i32 where N >= 0 && N <= 8); - simd_shl(simd_cast(a), vdupq_n_u16(N.try_into().unwrap())) +pub unsafe fn vrshrq_n_u8(a: uint8x16_t) -> uint8x16_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + vrshlq_u8(a, vdupq_n_s8((-N).try_into().unwrap())) } -/// Signed shift left long +/// Unsigned rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u16", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshll_n_u16(a: uint16x4_t) -> uint32x4_t { - static_assert!(N : i32 where N >= 0 && N <= 16); - simd_shl(simd_cast(a), vdupq_n_u32(N.try_into().unwrap())) +pub unsafe fn vrshr_n_u16(a: uint16x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + vrshl_u16(a, vdup_n_s16((-N).try_into().unwrap())) } -/// Signed shift left long +/// Unsigned rounding shift right #[inline] #[target_feature(enable = 
"neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u32", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshll_n_u32(a: uint32x2_t) -> uint64x2_t { - static_assert!(N : i32 where N >= 0 && N <= 32); - simd_shl(simd_cast(a), vdupq_n_u64(N.try_into().unwrap())) +pub unsafe fn vrshrq_n_u16(a: uint16x8_t) -> uint16x8_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + vrshlq_u16(a, vdupq_n_s16((-N).try_into().unwrap())) } -/// Shift right +/// Unsigned rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshr_n_s8(a: int8x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_shr(a, vdup_n_s8(N.try_into().unwrap())) +pub unsafe fn vrshr_n_u32(a: uint32x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + vrshl_u32(a, vdup_n_s32((-N).try_into().unwrap())) } -/// Shift right +/// Unsigned rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshrq_n_s8(a: int8x16_t) -> int8x16_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_shr(a, vdupq_n_s8(N.try_into().unwrap())) +pub unsafe fn vrshrq_n_u32(a: uint32x4_t) -> uint32x4_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + vrshlq_u32(a, vdupq_n_s32((-N).try_into().unwrap())) } -/// Shift right +/// Unsigned rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshr_n_s16(a: int16x4_t) -> int16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - simd_shr(a, vdup_n_s16(N.try_into().unwrap())) +pub unsafe fn vrshr_n_u64(a: uint64x1_t) -> uint64x1_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + vrshl_u64(a, vdup_n_s64((-N).try_into().unwrap())) } -/// Shift right +/// Unsigned rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))] +#[cfg_attr(all(test, target_arch = 
"arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshrq_n_s16(a: int16x8_t) -> int16x8_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - simd_shr(a, vdupq_n_s16(N.try_into().unwrap())) +pub unsafe fn vrshrq_n_u64(a: uint64x2_t) -> uint64x2_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + vrshlq_u64(a, vdupq_n_s64((-N).try_into().unwrap())) } -/// Shift right +/// Rounding shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshr_n_s32(a: int32x2_t) -> int32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - simd_shr(a, vdup_n_s32(N.try_into().unwrap())) -} - -/// Shift right -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vshrq_n_s32(a: int32x4_t) -> int32x4_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - simd_shr(a, vdupq_n_s32(N.try_into().unwrap())) +pub unsafe fn vrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v8i8")] + fn vrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t; + } +vrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) } -/// Shift right +/// Rounding shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshr_n_s64(a: int64x1_t) -> int64x1_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - simd_shr(a, vdup_n_s64(N.try_into().unwrap())) +pub unsafe fn vrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v8i8")] + fn vrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t; + } +vrshrn_n_s16_(a, N) } -/// Shift right +/// Rounding shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshrq_n_s64(a: int64x2_t) -> int64x2_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - 
simd_shr(a, vdupq_n_s64(N.try_into().unwrap())) +pub unsafe fn vrshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v4i16")] + fn vrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t; + } +vrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) } -/// Shift right +/// Rounding shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshr_n_u8(a: uint8x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_shr(a, vdup_n_u8(N.try_into().unwrap())) +pub unsafe fn vrshrn_n_s32(a: int32x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v4i16")] + fn vrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t; + } +vrshrn_n_s32_(a, N) } -/// Shift right +/// Rounding shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshrq_n_u8(a: uint8x16_t) -> uint8x16_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_shr(a, vdupq_n_u8(N.try_into().unwrap())) +pub unsafe fn vrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v2i32")] + fn vrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t; + } +vrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64)) } -/// Shift right +/// Rounding shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshr_n_u16(a: uint16x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - simd_shr(a, vdup_n_u16(N.try_into().unwrap())) +pub unsafe fn vrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v2i32")] + fn vrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t; + } +vrshrn_n_s64_(a, N) } -/// Shift right +/// Rounding shift right narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))] 
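// Illustrative sketch (not part of the generated file; test name and values
// are assumptions): the rounding shifts above add the rounding constant
// 1 << (N - 1) before shifting, so ties round toward positive infinity.
#[simd_test(enable = "neon")]
unsafe fn rounding_shift_right_sketch() {
    let a: i16x4 = i16x4::new(5, 6, -5, -6);
    // (5+2)>>2 == 1, (6+2)>>2 == 2, (-5+2)>>2 == -1, (-6+2)>>2 == -1
    let e: i16x4 = i16x4::new(1, 2, -1, -1);
    let r: i16x4 = transmute(vrshr_n_s16::<2>(transmute(a)));
    assert_eq!(r, e);
}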
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shr(a, vdupq_n_u16(N.try_into().unwrap()))
+pub unsafe fn vrshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    transmute(vrshrn_n_s16::<N>(transmute(a)))
 }

-/// Shift right
+/// Rounding shift right narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shr(a, vdup_n_u32(N.try_into().unwrap()))
+pub unsafe fn vrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    transmute(vrshrn_n_s32::<N>(transmute(a)))
 }

-/// Shift right
+/// Rounding shift right narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+pub unsafe fn vrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
     static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shr(a, vdupq_n_u32(N.try_into().unwrap()))
+    transmute(vrshrn_n_s64::<N>(transmute(a)))
 }

-/// Shift right
+/// Signed rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_shr(a, vdup_n_u64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsra_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_add(a, vrshr_n_s8::<N>(b))
 }

-/// Shift right
+/// Signed rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_shr(a, vdupq_n_u64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
"arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsraq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_add(a, vrshrq_n_s8::(b)) } -/// Shift right narrow +/// Signed rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vshrn_n_s16(a: int16x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_cast(simd_shr(a, vdupq_n_s16(N.try_into().unwrap()))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsra_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_add(a, vrshr_n_s16::(b)) } -/// Shift right narrow +/// Signed rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vshrn_n_s32(a: int32x4_t) -> int16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsraq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { static_assert!(N : i32 where N >= 1 && N <= 16); - simd_cast(simd_shr(a, vdupq_n_s32(N.try_into().unwrap()))) + simd_add(a, vrshrq_n_s16::(b)) } -/// Shift right narrow +/// Signed rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vshrn_n_s64(a: int64x2_t) -> int32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsra_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { static_assert!(N : i32 where N >= 1 && N <= 32); - simd_cast(simd_shr(a, vdupq_n_s64(N.try_into().unwrap()))) + simd_add(a, vrshr_n_s32::(b)) } -/// Shift right narrow +/// Signed rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_cast(simd_shr(a, vdupq_n_u16(N.try_into().unwrap()))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] 
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsraq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vrshrq_n_s32::<N>(b))
 }

-/// Shift right narrow
+/// Signed rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_cast(simd_shr(a, vdupq_n_u32(N.try_into().unwrap())))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsra_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vrshr_n_s64::<N>(b))
 }

-/// Shift right narrow
+/// Signed rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_cast(simd_shr(a, vdupq_n_u64(N.try_into().unwrap())))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsraq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vrshrq_n_s64::<N>(b))
 }

-/// Signed shift right and accumulate
+/// Unsigned rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+pub unsafe fn vrsra_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
     static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vshr_n_u8::<N>(b))
+    simd_add(a, vrshr_n_u8::<N>(b))
 }

-/// Signed shift right and accumulate
+/// Unsigned rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+pub unsafe fn vrsraq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
     static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vshrq_n_u8::<N>(b))
+    simd_add(a, vrshrq_n_u8::<N>(b))
 }

-/// Signed shift right and accumulate
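// Illustrative sketch (not part of the generated file; test name and values
// are assumptions): by the definitions above, the accumulate forms compute
// exactly "a plus the rounding shift right of b", i.e. what the
// simd_add(a, vrshr_n_*::<N>(b)) bodies spell out.
#[simd_test(enable = "neon")]
unsafe fn rounding_accumulate_sketch() {
    let a: i32x2 = i32x2::new(1, 1);
    let b: i32x2 = i32x2::new(6, -6);
    // vrshr_n_s32::<2> of b gives (2, -1), accumulated onto (1, 1).
    let e: i32x2 = i32x2::new(3, 0);
    let r: i32x2 = transmute(vrsra_n_s32::<2>(transmute(a), transmute(b)));
    assert_eq!(r, e);
}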
+/// Unsigned rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsra_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { +pub unsafe fn vrsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { static_assert!(N : i32 where N >= 1 && N <= 16); - simd_add(a, vshr_n_s16::(b)) + simd_add(a, vrshr_n_u16::(b)) } -/// Signed shift right and accumulate +/// Unsigned rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsraq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { +pub unsafe fn vrsraq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { static_assert!(N : i32 where N >= 1 && N <= 16); - simd_add(a, vshrq_n_s16::(b)) + simd_add(a, vrshrq_n_u16::(b)) } -/// Signed shift right and accumulate +/// Unsigned rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsra_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { +pub unsafe fn vrsra_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { static_assert!(N : i32 where N >= 1 && N <= 32); - simd_add(a, vshr_n_s32::(b)) + simd_add(a, vrshr_n_u32::(b)) } -/// Signed shift right and accumulate +/// Unsigned rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsraq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { +pub unsafe fn vrsraq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { static_assert!(N : i32 where N >= 1 && N <= 32); - simd_add(a, vshrq_n_s32::(b)) + simd_add(a, vrshrq_n_u32::(b)) } -/// Signed shift right and accumulate +/// Unsigned rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, 
target_arch = "aarch64"), assert_instr(ursra, N = 2))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsra_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { +pub unsafe fn vrsra_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { static_assert!(N : i32 where N >= 1 && N <= 64); - simd_add(a, vshr_n_s64::(b)) + simd_add(a, vrshr_n_u64::(b)) } -/// Signed shift right and accumulate +/// Unsigned rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsraq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { +pub unsafe fn vrsraq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { static_assert!(N : i32 where N >= 1 && N <= 64); - simd_add(a, vshrq_n_s64::(b)) + simd_add(a, vrshrq_n_u64::(b)) } -/// Unsigned shift right and accumulate +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsra_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_add(a, vshr_n_u8::(b)) +pub unsafe fn vset_lane_s8(a: i8, b: int8x8_t) -> int8x8_t { + static_assert_imm3!(LANE); + simd_insert(b, LANE as u32, a) } -/// Unsigned shift right and accumulate +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsraq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_add(a, vshrq_n_u8::(b)) +pub unsafe fn vset_lane_s16(a: i16, b: int16x4_t) -> int16x4_t { + static_assert_imm2!(LANE); + simd_insert(b, LANE as u32, a) } -/// Unsigned shift right and accumulate +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - simd_add(a, vshr_n_u16::(b)) +pub unsafe fn vset_lane_s32(a: i32, b: int32x2_t) -> int32x2_t { + 
static_assert_imm1!(LANE); + simd_insert(b, LANE as u32, a) } -/// Unsigned shift right and accumulate +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsraq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - simd_add(a, vshrq_n_u16::(b)) +pub unsafe fn vset_lane_s64(a: i64, b: int64x1_t) -> int64x1_t { + static_assert!(LANE : i32 where LANE == 0); + simd_insert(b, LANE as u32, a) } -/// Unsigned shift right and accumulate +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsra_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - simd_add(a, vshr_n_u32::(b)) +pub unsafe fn vset_lane_u8(a: u8, b: uint8x8_t) -> uint8x8_t { + static_assert_imm3!(LANE); + simd_insert(b, LANE as u32, a) } -/// Unsigned shift right and accumulate +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsraq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - simd_add(a, vshrq_n_u32::(b)) +pub unsafe fn vset_lane_u16(a: u16, b: uint16x4_t) -> uint16x4_t { + static_assert_imm2!(LANE); + simd_insert(b, LANE as u32, a) } -/// Unsigned shift right and accumulate +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsra_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - simd_add(a, vshr_n_u64::(b)) +pub unsafe fn vset_lane_u32(a: u32, b: uint32x2_t) -> uint32x2_t { + static_assert_imm1!(LANE); + simd_insert(b, LANE as u32, a) } -/// Unsigned shift right and accumulate +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn vsraq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - simd_add(a, vshrq_n_u64::(b)) +pub unsafe fn vset_lane_u64(a: u64, b: uint64x1_t) -> uint64x1_t { + static_assert!(LANE : i32 where LANE == 0); + simd_insert(b, LANE as u32, a) } -/// Unsigned Absolute difference and Accumulate Long +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] -pub unsafe fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { - let d: uint8x8_t = vabd_u8(b, c); - simd_add(a, simd_cast(d)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vset_lane_p8(a: p8, b: poly8x8_t) -> poly8x8_t { + static_assert_imm3!(LANE); + simd_insert(b, LANE as u32, a) } -/// Unsigned Absolute difference and Accumulate Long +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] -pub unsafe fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { - let d: uint16x4_t = vabd_u16(b, c); - simd_add(a, simd_cast(d)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vset_lane_p16(a: p16, b: poly16x4_t) -> poly16x4_t { + static_assert_imm2!(LANE); + simd_insert(b, LANE as u32, a) } -/// Unsigned Absolute difference and Accumulate Long +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vset_lane_p64(a: p64, b: poly64x1_t) -> poly64x1_t { + static_assert!(LANE : i32 where LANE == 0); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] -pub unsafe fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { - let d: uint32x2_t = vabd_u32(b, c); - simd_add(a, simd_cast(d)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn 
vsetq_lane_s8(a: i8, b: int8x16_t) -> int8x16_t { + static_assert_imm4!(LANE); + simd_insert(b, LANE as u32, a) } -/// Signed Absolute difference and Accumulate Long +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] -pub unsafe fn vabal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { - let d: int8x8_t = vabd_s8(b, c); - let e: uint8x8_t = simd_cast(d); - simd_add(a, simd_cast(e)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_s16(a: i16, b: int16x8_t) -> int16x8_t { + static_assert_imm3!(LANE); + simd_insert(b, LANE as u32, a) } -/// Signed Absolute difference and Accumulate Long +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] -pub unsafe fn vabal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { - let d: int16x4_t = vabd_s16(b, c); - let e: uint16x4_t = simd_cast(d); - simd_add(a, simd_cast(e)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_s32(a: i32, b: int32x4_t) -> int32x4_t { + static_assert_imm2!(LANE); + simd_insert(b, LANE as u32, a) } -/// Signed Absolute difference and Accumulate Long +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] -pub unsafe fn vabal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { - let d: int32x2_t = vabd_s32(b, c); - let e: uint32x2_t = simd_cast(d); - simd_add(a, simd_cast(e)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_s64(a: i64, b: int64x2_t) -> int64x2_t { + static_assert_imm1!(LANE); + simd_insert(b, LANE as u32, a) } -/// Singned saturating Absolute value +/// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] -pub unsafe fn vqabs_s8(a: int8x8_t) -> int8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_u8(a: u8, b: uint8x16_t) -> uint8x16_t { + static_assert_imm4!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_u16(a: u16, b: uint16x8_t) -> uint16x8_t { + static_assert_imm3!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_u32(a: u32, b: uint32x4_t) -> uint32x4_t { + static_assert_imm2!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_u64(a: u64, b: uint64x2_t) -> uint64x2_t { + static_assert_imm1!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_p8(a: p8, b: poly8x16_t) -> poly8x16_t { + static_assert_imm4!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_p16(a: p16, b: poly16x8_t) -> poly16x8_t { + static_assert_imm3!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_p64(a: p64, b: poly64x2_t) -> poly64x2_t { + static_assert_imm1!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vset_lane_f32(a: f32, b: float32x2_t) -> float32x2_t { + static_assert_imm1!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Insert vector element from another vector element +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] 
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsetq_lane_f32(a: f32, b: float32x4_t) -> float32x4_t { + static_assert_imm2!(LANE); + simd_insert(b, LANE as u32, a) +} + +/// Signed Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))] +pub unsafe fn vshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i8")] - fn vqabs_s8_(a: int8x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i8")] + fn vshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; } -vqabs_s8_(a) +vshl_s8_(a, b) } -/// Singned saturating Absolute value +/// Signed Shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] -pub unsafe fn vqabsq_s8(a: int8x16_t) -> int8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))] +pub unsafe fn vshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v16i8")] - fn vqabsq_s8_(a: int8x16_t) -> int8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v16i8")] + fn vshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } -vqabsq_s8_(a) +vshlq_s8_(a, b) } -/// Singned saturating Absolute value +/// Signed Shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] -pub unsafe fn vqabs_s16(a: int16x4_t) -> int16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))] +pub unsafe fn vshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i16")] - fn vqabs_s16_(a: int16x4_t) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i16")] + fn vshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } -vqabs_s16_(a) +vshl_s16_(a, b) } -/// Singned saturating Absolute value +/// Signed Shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] -pub unsafe fn 
vqabsq_s16(a: int16x8_t) -> int16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))] +pub unsafe fn vshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i16")] - fn vqabsq_s16_(a: int16x8_t) -> int16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i16")] + fn vshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } -vqabsq_s16_(a) +vshlq_s16_(a, b) } -/// Singned saturating Absolute value +/// Signed Shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] -pub unsafe fn vqabs_s32(a: int32x2_t) -> int32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))] +pub unsafe fn vshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v2i32")] - fn vqabs_s32_(a: int32x2_t) -> int32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i32")] + fn vshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } -vqabs_s32_(a) +vshl_s32_(a, b) } -/// Singned saturating Absolute value +/// Signed Shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] -pub unsafe fn vqabsq_s32(a: int32x4_t) -> int32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))] +pub unsafe fn vshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i32")] - fn vqabsq_s32_(a: int32x4_t) -> int32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i32")] + fn vshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; } -vqabsq_s32_(a) +vshlq_s32_(a, b) } -#[cfg(test)] -#[allow(overflowing_literals)] -mod test { - use super::*; - use crate::core_arch::simd::*; - use std::mem::transmute; - use stdarch_test::simd_test; - - #[simd_test(enable = "neon")] - unsafe fn test_vand_s8() { - let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); - let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b))); - assert_eq!(r, e); +/// Signed Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))] +pub unsafe fn vshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v1i64")] + fn vshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; } +vshl_s64_(a, b) +} - #[simd_test(enable = "neon")] - unsafe fn test_vandq_s8() { - let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); - let b: i8x16 = i8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); - let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); - let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b))); +/// Signed Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))] +pub unsafe fn vshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i64")] + fn vshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } +vshlq_s64_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i8")] + fn vshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } +vshl_u8_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v16i8")] + fn vshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } +vshlq_u8_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshl_u16(a: uint16x4_t, b: 
int16x4_t) -> uint16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i16")] + fn vshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } +vshl_u16_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i16")] + fn vshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } +vshlq_u16_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i32")] + fn vshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } +vshl_u32_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i32")] + fn vshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } +vshlq_u32_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v1i64")] + fn vshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } +vshl_u64_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i64")] + fn vshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } +vshlq_u64_(a, b) +} + +/// Shift left +#[inline] 
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+    static_assert_imm3!(N);
+    simd_shl(a, vdup_n_s8(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+    static_assert_imm3!(N);
+    simd_shl(a, vdupq_n_s8(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+    static_assert_imm4!(N);
+    simd_shl(a, vdup_n_s16(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+    static_assert_imm4!(N);
+    simd_shl(a, vdupq_n_s16(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+    static_assert_imm5!(N);
+    simd_shl(a, vdup_n_s32(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+    static_assert_imm5!(N);
+    simd_shl(a, vdupq_n_s32(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+    static_assert_imm3!(N);
+    simd_shl(a, vdup_n_u8(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+    static_assert_imm3!(N);
+    simd_shl(a, vdupq_n_u8(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
+    static_assert_imm4!(N);
+    simd_shl(a, vdup_n_u16(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
+    static_assert_imm4!(N);
+    simd_shl(a, vdupq_n_u16(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
+    static_assert_imm5!(N);
+    simd_shl(a, vdup_n_u32(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+    static_assert_imm5!(N);
+    simd_shl(a, vdupq_n_u32(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+    static_assert_imm6!(N);
+    simd_shl(a, vdup_n_s64(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+    static_assert_imm6!(N);
+    simd_shl(a, vdupq_n_s64(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
+    static_assert_imm6!(N);
+    simd_shl(a, vdup_n_u64(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+    static_assert_imm6!(N);
+    simd_shl(a, vdupq_n_u64(N.try_into().unwrap()))
+}
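// A minimal usage sketch for the immediate shift-left family above — illustrative
// only, not lines of this patch. It assumes an AArch64 target where the intrinsic
// is exposed via core::arch::aarch64; the wrapper function name is hypothetical.
#[cfg(target_arch = "aarch64")]
unsafe fn example_vshl_n() {
    use core::arch::aarch64::*;
    let a = vdupq_n_s16(3);
    // The shift amount is the const parameter N, validated at compile time
    // (static_assert_imm4! restricts it to 0..=15 for 16-bit lanes).
    let r = vshlq_n_s16::<2>(a); // 3 << 2 == 12 in every lane
    assert_eq!(vgetq_lane_s16::<0>(r), 12);
}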
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_s8<const N: i32>(a: int8x8_t) -> int16x8_t {
+    static_assert!(N : i32 where N >= 0 && N <= 8);
+    simd_shl(simd_cast(a), vdupq_n_s16(N.try_into().unwrap()))
+}
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_s16<const N: i32>(a: int16x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 0 && N <= 16);
+    simd_shl(simd_cast(a), vdupq_n_s32(N.try_into().unwrap()))
+}
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_s32<const N: i32>(a: int32x2_t) -> int64x2_t {
+    static_assert!(N : i32 where N >= 0 && N <= 32);
+    simd_shl(simd_cast(a), vdupq_n_s64(N.try_into().unwrap()))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_u8<const N: i32>(a: uint8x8_t) -> uint16x8_t {
+    static_assert!(N : i32 where N >= 0 && N <= 8);
+    simd_shl(simd_cast(a), vdupq_n_u16(N.try_into().unwrap()))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_u16<const N: i32>(a: uint16x4_t) -> uint32x4_t {
+    static_assert!(N : i32 where N >= 0 && N <= 16);
+    simd_shl(simd_cast(a), vdupq_n_u32(N.try_into().unwrap()))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_u32<const N: i32>(a: uint32x2_t) -> uint64x2_t {
+    static_assert!(N : i32 where N >= 0 && N <= 32);
+    simd_shl(simd_cast(a), vdupq_n_u64(N.try_into().unwrap()))
+}
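// A sketch of the widening behaviour of vshll_n (illustrative only, not part of
// this patch; assumes an AArch64 target and core::arch::aarch64; the wrapper
// name is made up):
#[cfg(target_arch = "aarch64")]
unsafe fn example_vshll_n() {
    use core::arch::aarch64::*;
    let a = vdup_n_u8(200);
    // Each lane is widened to 16 bits *before* the shift, so 200 << 1 == 400
    // is representable instead of wrapping in 8 bits.
    let r: uint16x8_t = vshll_n_u8::<1>(a);
    assert_eq!(vgetq_lane_u16::<0>(r), 400);
}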
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_shr(a, vdup_n_s8(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_shr(a, vdupq_n_s8(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_shr(a, vdup_n_s16(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_shr(a, vdupq_n_s16(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_shr(a, vdup_n_s32(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_shr(a, vdupq_n_s32(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_shr(a, vdup_n_s64(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_shr(a, vdupq_n_s64(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshr_n_u8(a: uint8x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_shr(a, vdup_n_u8(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshrq_n_u8(a: uint8x16_t) -> uint8x16_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_shr(a, vdupq_n_u8(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshr_n_u16(a: uint16x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_shr(a, vdup_n_u16(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshrq_n_u16(a: uint16x8_t) -> uint16x8_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_shr(a, vdupq_n_u16(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshr_n_u32(a: uint32x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + simd_shr(a, vdup_n_u32(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshrq_n_u32(a: uint32x4_t) -> uint32x4_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + simd_shr(a, vdupq_n_u32(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshr_n_u64(a: uint64x1_t) -> uint64x1_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + simd_shr(a, vdup_n_u64(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshrq_n_u64(a: uint64x2_t) -> uint64x2_t { + static_assert!(N : i32 
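// A sketch contrasting the signed and unsigned immediate right shifts above
// (illustrative only, not part of this patch; assumes an AArch64 target and
// core::arch::aarch64; the wrapper name is hypothetical):
#[cfg(target_arch = "aarch64")]
unsafe fn example_vshr_n() {
    use core::arch::aarch64::*;
    // sshr replicates the sign bit: -16 >> 2 == -4.
    let s = vshr_n_s32::<2>(vdup_n_s32(-16));
    assert_eq!(vget_lane_s32::<0>(s), -4);
    // ushr shifts in zeros: the same bit pattern 0xFFFF_FFF0 >> 2 == 0x3FFF_FFFC.
    let u = vshr_n_u32::<2>(vdup_n_u32(0xFFFF_FFF0));
    assert_eq!(vget_lane_u32::<0>(u), 0x3FFF_FFFC);
}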
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_cast(simd_shr(a, vdupq_n_s16(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_cast(simd_shr(a, vdupq_n_s32(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_cast(simd_shr(a, vdupq_n_s64(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_cast(simd_shr(a, vdupq_n_u16(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_cast(simd_shr(a, vdupq_n_u32(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_cast(simd_shr(a, vdupq_n_u64(N.try_into().unwrap())))
+}
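// A sketch of the narrowing behaviour of vshrn_n (illustrative only, not part of
// this patch; assumes an AArch64 target and core::arch::aarch64):
#[cfg(target_arch = "aarch64")]
unsafe fn example_vshrn_n() {
    use core::arch::aarch64::*;
    // Each 16-bit lane is shifted right, then truncated to 8 bits:
    // 0x0403 >> 2 == 0x0100, and keeping only the low byte yields 0x00.
    let r = vshrn_n_u16::<2>(vdupq_n_u16(0x0403));
    assert_eq!(vget_lane_u8::<0>(r), 0x00);
}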
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_add(a, vshr_n_s8::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_add(a, vshrq_n_s8::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_add(a, vshr_n_s16::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_add(a, vshrq_n_s16::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vshr_n_s32::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vshrq_n_s32::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vshr_n_s64::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vshrq_n_s64::<N>(b))
+}
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsra_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_add(a, vshr_n_u8::(b)) +} + +/// Unsigned shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsraq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_add(a, vshrq_n_u8::(b)) +} + +/// Unsigned shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_add(a, vshr_n_u16::(b)) +} + +/// Unsigned shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsraq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_add(a, vshrq_n_u16::(b)) +} + +/// Unsigned shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsra_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + simd_add(a, vshr_n_u32::(b)) +} + +/// Unsigned shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsraq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + simd_add(a, vshrq_n_u32::(b)) +} + +/// Unsigned shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsra_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + simd_add(a, vshr_n_u64::(b)) +} + +/// Unsigned shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
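// A sketch of shift-right-and-accumulate (illustrative only, not part of this
// patch; assumes an AArch64 target and core::arch::aarch64; the wrapper name
// is hypothetical):
#[cfg(target_arch = "aarch64")]
unsafe fn example_vsra_n() {
    use core::arch::aarch64::*;
    let acc = vdup_n_u32(1);
    let b = vdup_n_u32(64);
    // Computes acc + (b >> N) lane-wise: 1 + (64 >> 4) == 5.
    let r = vsra_n_u32::<4>(acc, b);
    assert_eq!(vget_lane_u32::<0>(r), 5);
}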
target_arch = "arm"), assert_instr(vsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsraq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + simd_add(a, vshrq_n_u64::(b)) +} + +/// Unsigned Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] +pub unsafe fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + let d: uint8x8_t = vabd_u8(b, c); + simd_add(a, simd_cast(d)) +} + +/// Unsigned Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] +pub unsafe fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + let d: uint16x4_t = vabd_u16(b, c); + simd_add(a, simd_cast(d)) +} + +/// Unsigned Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] +pub unsafe fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + let d: uint32x2_t = vabd_u32(b, c); + simd_add(a, simd_cast(d)) +} + +/// Signed Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] +pub unsafe fn vabal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { + let d: int8x8_t = vabd_s8(b, c); + let e: uint8x8_t = simd_cast(d); + simd_add(a, simd_cast(e)) +} + +/// Signed Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] +pub unsafe fn vabal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + let d: int16x4_t = vabd_s16(b, c); + let e: uint16x4_t = simd_cast(d); + simd_add(a, simd_cast(e)) +} + +/// Signed Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] +pub unsafe fn vabal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + let d: int32x2_t = vabd_s32(b, c); + let e: uint32x2_t = simd_cast(d); + simd_add(a, simd_cast(e)) +} + +/// Singned saturating Absolute value +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] +pub unsafe fn vqabs_s8(a: int8x8_t) -> int8x8_t { + 
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabs_s8(a: int8x8_t) -> int8x8_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i8")]
+        fn vqabs_s8_(a: int8x8_t) -> int8x8_t;
+    }
+vqabs_s8_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabsq_s8(a: int8x16_t) -> int8x16_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v16i8")]
+        fn vqabsq_s8_(a: int8x16_t) -> int8x16_t;
+    }
+vqabsq_s8_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabs_s16(a: int16x4_t) -> int16x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i16")]
+        fn vqabs_s16_(a: int16x4_t) -> int16x4_t;
+    }
+vqabs_s16_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabsq_s16(a: int16x8_t) -> int16x8_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i16")]
+        fn vqabsq_s16_(a: int16x8_t) -> int16x8_t;
+    }
+vqabsq_s16_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabs_s32(a: int32x2_t) -> int32x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v2i32")]
+        fn vqabs_s32_(a: int32x2_t) -> int32x2_t;
+    }
+vqabs_s32_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabsq_s32(a: int32x4_t) -> int32x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i32")]
+        fn vqabsq_s32_(a: int32x4_t) -> int32x4_t;
+    }
+vqabsq_s32_(a)
+}
+
+#[cfg(test)]
+#[allow(overflowing_literals)]
+mod test {
+    use super::*;
+    use crate::core_arch::simd::*;
+    use std::mem::transmute;
+    use
stdarch_test::simd_test; + + #[simd_test(enable = "neon")] + unsafe fn test_vand_s8() { + let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s8() { + let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let b: i8x16 = i8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_s16() { + let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); + let b: i16x4 = i16x4::new(0x0F, 0x0F, 0x0F, 0x0F); + let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); + let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); + let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00); + let e: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00); + let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s16() { + let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i16x8 = i16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_s32() { + let a: i32x2 = i32x2::new(0x00, 0x01); + let b: i32x2 = i32x2::new(0x0F, 0x0F); + let e: i32x2 = i32x2::new(0x00, 0x01); + let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i32x2 = i32x2::new(0x00, 0x01); + let b: i32x2 = i32x2::new(0x00, 0x00); + let e: i32x2 = i32x2::new(0x00, 0x00); + let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s32() { + let a: i32x4 = 
i32x4::new(0x00, 0x01, 0x02, 0x03); + let b: i32x4 = i32x4::new(0x0F, 0x0F, 0x0F, 0x0F); + let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); + let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); + let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); + let e: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); + let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_u8() { + let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u8x8 = u8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u8() { + let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let b: u8x16 = u8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_u16() { + let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0x0F, 0x0F, 0x0F, 0x0F); + let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); + let e: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); + let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u16() { + let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_u32() { + let a: u32x2 = u32x2::new(0x00, 0x01); + let b: u32x2 = 
u32x2::new(0x0F, 0x0F); + let e: u32x2 = u32x2::new(0x00, 0x01); + let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u32x2 = u32x2::new(0x00, 0x01); + let b: u32x2 = u32x2::new(0x00, 0x00); + let e: u32x2 = u32x2::new(0x00, 0x00); + let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u32() { + let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0x0F, 0x0F, 0x0F, 0x0F); + let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); + let e: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); + let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_s64() { + let a: i64x1 = i64x1::new(0x00); + let b: i64x1 = i64x1::new(0x0F); + let e: i64x1 = i64x1::new(0x00); + let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i64x1 = i64x1::new(0x00); + let b: i64x1 = i64x1::new(0x00); + let e: i64x1 = i64x1::new(0x00); + let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s64() { + let a: i64x2 = i64x2::new(0x00, 0x01); + let b: i64x2 = i64x2::new(0x0F, 0x0F); + let e: i64x2 = i64x2::new(0x00, 0x01); + let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i64x2 = i64x2::new(0x00, 0x01); + let b: i64x2 = i64x2::new(0x00, 0x00); + let e: i64x2 = i64x2::new(0x00, 0x00); + let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_u64() { + let a: u64x1 = u64x1::new(0x00); + let b: u64x1 = u64x1::new(0x0F); + let e: u64x1 = u64x1::new(0x00); + let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u64x1 = u64x1::new(0x00); + let b: u64x1 = u64x1::new(0x00); + let e: u64x1 = u64x1::new(0x00); + let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u64() { + let a: u64x2 = u64x2::new(0x00, 0x01); + let b: u64x2 = u64x2::new(0x0F, 0x0F); + let e: u64x2 = u64x2::new(0x00, 0x01); + let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u64x2 = u64x2::new(0x00, 0x01); + let b: u64x2 = u64x2::new(0x00, 0x00); + let e: u64x2 = u64x2::new(0x00, 0x00); + let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s8() { + let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: i8x8 = transmute(vorr_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s8() { + let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 
0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let r: i8x16 = transmute(vorrq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s16() { + let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); + let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00); + let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); + let r: i16x4 = transmute(vorr_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s16() { + let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: i16x8 = transmute(vorrq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s32() { + let a: i32x2 = i32x2::new(0x00, 0x01); + let b: i32x2 = i32x2::new(0x00, 0x00); + let e: i32x2 = i32x2::new(0x00, 0x01); + let r: i32x2 = transmute(vorr_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s32() { + let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); + let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); + let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); + let r: i32x4 = transmute(vorrq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u8() { + let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: u8x8 = transmute(vorr_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u8() { + let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let r: u8x16 = transmute(vorrq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u16() { + let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); + let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let r: u16x4 = transmute(vorr_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u16() { + let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: u16x8 = transmute(vorrq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u32() { + let a: u32x2 = u32x2::new(0x00, 0x01); + let b: u32x2 = u32x2::new(0x00, 0x00); + let e: u32x2 = u32x2::new(0x00, 0x01); + let r: u32x2 = transmute(vorr_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u32() { + let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); + let e: u32x4 = u32x4::new(0x00, 0x01, 
0x02, 0x03);
+        let r: u32x4 = transmute(vorrq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
+    }
-        let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
-        let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b)));
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vorr_s64() {
+        let a: i64x1 = i64x1::new(0x00);
+        let b: i64x1 = i64x1::new(0x00);
+        let e: i64x1 = i64x1::new(0x00);
+        let r: i64x1 = transmute(vorr_s64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vand_s16() {
-        let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: i16x4 = i16x4::new(0x0F, 0x0F, 0x0F, 0x0F);
-        let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b)));
+    unsafe fn test_vorrq_s64() {
+        let a: i64x2 = i64x2::new(0x00, 0x01);
+        let b: i64x2 = i64x2::new(0x00, 0x00);
+        let e: i64x2 = i64x2::new(0x00, 0x01);
+        let r: i64x2 = transmute(vorrq_s64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vorr_u64() {
+        let a: u64x1 = u64x1::new(0x00);
+        let b: u64x1 = u64x1::new(0x00);
+        let e: u64x1 = u64x1::new(0x00);
+        let r: u64x1 = transmute(vorr_u64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vorrq_u64() {
+        let a: u64x2 = u64x2::new(0x00, 0x01);
+        let b: u64x2 = u64x2::new(0x00, 0x00);
+        let e: u64x2 = u64x2::new(0x00, 0x01);
+        let r: u64x2 = transmute(vorrq_u64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veor_s8() {
+        let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let r: i8x8 = transmute(veor_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veorq_s8() {
+        let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+        let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+        let r: i8x16 = transmute(veorq_s8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veor_s16() {
         let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
         let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
-        let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b)));
+        let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+        let r: i16x4 = transmute(veor_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vandq_s16() {
+    unsafe fn test_veorq_s16() {
         let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i16x8 = i16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+        let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
         let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b)));
+        let r: i16x8 = transmute(veorq_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veor_s32() {
+        let a: i32x2 = i32x2::new(0x00, 0x01);
+        let b: i32x2 = i32x2::new(0x00, 0x00);
+        let e: i32x2 = i32x2::new(0x00, 0x01);
+        let r: i32x2 = transmute(veor_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veorq_s32() {
+        let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+        let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+        let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+        let r: i32x4 = transmute(veorq_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veor_u8() {
+        let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let r: u8x8 = transmute(veor_u8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veorq_u8() {
+        let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+        let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+        let r: u8x16 = transmute(veorq_u8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veor_u16() {
+        let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+        let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
+        let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+        let r: u16x4 = transmute(veor_u16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veorq_u16() {
+        let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let r: u16x8 = transmute(veorq_u16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veor_u32() {
+        let a: u32x2 = u32x2::new(0x00, 0x01);
+        let b: u32x2 = u32x2::new(0x00, 0x00);
+        let e: u32x2 = u32x2::new(0x00, 0x01);
+        let r: u32x2 = transmute(veor_u32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veorq_u32() {
+        let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+        let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
+        let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+        let r: u32x4 = transmute(veorq_u32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veor_s64() {
+        let a: i64x1 = i64x1::new(0x00);
+        let b: i64x1 = i64x1::new(0x00);
+        let e: i64x1 = i64x1::new(0x00);
+        let r: i64x1 = transmute(veor_s64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veorq_s64() {
+        let a: i64x2 = i64x2::new(0x00, 0x01);
+        let b: i64x2 = i64x2::new(0x00, 0x00);
+        let e: i64x2 = i64x2::new(0x00, 0x01);
+        let r: i64x2 = transmute(veorq_s64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veor_u64() {
+        let a: u64x1 = u64x1::new(0x00);
+        let b: u64x1 = u64x1::new(0x00);
+        let e: u64x1 = u64x1::new(0x00);
+        let r: u64x1 = transmute(veor_u64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_veorq_u64() {
+        let a: u64x2 = u64x2::new(0x00, 0x01);
+        let b: u64x2 = u64x2::new(0x00, 0x00);
+        let e: u64x2 = u64x2::new(0x00, 0x01);
+        let r: u64x2 = transmute(veorq_u64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabd_s8() {
+        let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: i8x8 = i8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+        let e: i8x8 = i8x8::new(15, 13, 11, 9, 7, 5, 3, 1);
+        let r: i8x8 = transmute(vabd_s8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdq_s8() {
+        let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let b: i8x16 = i8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+        let e: i8x16 = i8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15);
+        let r: i8x16 = transmute(vabdq_s8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabd_s16() {
+        let a: i16x4 = i16x4::new(1, 2, 3, 4);
+        let b: i16x4 = i16x4::new(16, 15, 14, 13);
+        let e: i16x4 = i16x4::new(15, 13, 11, 9);
+        let r: i16x4 = transmute(vabd_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdq_s16() {
+        let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: i16x8 = i16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+        let e: i16x8 = i16x8::new(15, 13, 11, 9, 7, 5, 3, 1);
+        let r: i16x8 = transmute(vabdq_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabd_s32() {
+        let a: i32x2 = i32x2::new(1, 2);
+        let b: i32x2 = i32x2::new(16, 15);
+        let e: i32x2 = i32x2::new(15, 13);
+        let r: i32x2 = transmute(vabd_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdq_s32() {
+        let a: i32x4 = i32x4::new(1, 2, 3, 4);
+        let b: i32x4 = i32x4::new(16, 15, 14, 13);
+        let e: i32x4 = i32x4::new(15, 13, 11, 9);
+        let r: i32x4 = transmute(vabdq_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabd_u8() {
+        let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: u8x8 = u8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+        let e: u8x8 = u8x8::new(15, 13, 11, 9, 7, 5, 3, 1);
+        let r: u8x8 = transmute(vabd_u8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdq_u8() {
+        let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let b: u8x16 = u8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+        let e: u8x16 = u8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15);
+        let r: u8x16 = transmute(vabdq_u8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabd_u16() {
+        let a: u16x4 = u16x4::new(1, 2, 3, 4);
+        let b: u16x4 = u16x4::new(16, 15, 14, 13);
+        let e: u16x4 = u16x4::new(15, 13, 11, 9);
+        let r: u16x4 = transmute(vabd_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
+    }
-        let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b)));
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdq_u16() {
+        let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: u16x8 = u16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+        let e: u16x8 = u16x8::new(15, 13, 11, 9, 7, 5, 3, 1);
+        let r: u16x8 = transmute(vabdq_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vand_s32() {
-        let a: i32x2 = i32x2::new(0x00, 0x01);
-        let b: i32x2 = i32x2::new(0x0F, 0x0F);
-        let e: i32x2 = i32x2::new(0x00, 0x01);
-        let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b)));
+    unsafe fn test_vabd_u32() {
+        let a: u32x2 = u32x2::new(1, 2);
+        let b: u32x2 = u32x2::new(16, 15);
+        let e: u32x2 = u32x2::new(15, 13);
+        let r: u32x2 = transmute(vabd_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
+    }
-        let a: i32x2 = i32x2::new(0x00, 0x01);
-        let b: i32x2 = i32x2::new(0x00, 0x00);
-        let e: i32x2 = i32x2::new(0x00, 0x00);
-        let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b)));
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdq_u32() {
+        let a: u32x4 = u32x4::new(1, 2, 3, 4);
+        let b: u32x4 = u32x4::new(16, 15, 14, 13);
+        let e: u32x4 = u32x4::new(15, 13, 11, 9);
+        let r: u32x4 = transmute(vabdq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vandq_s32() {
-        let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: i32x4 = i32x4::new(0x0F, 0x0F, 0x0F, 0x0F);
-        let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vabd_f32() {
+        let a: f32x2 = f32x2::new(1.0, 2.0);
+        let b: f32x2 = f32x2::new(9.0, 3.0);
+        let e: f32x2 = f32x2::new(8.0, 1.0);
+        let r: f32x2 = transmute(vabd_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
+    }
-        let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
-        let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b)));
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdq_f32() {
+        let a: f32x4 = f32x4::new(1.0, 2.0, 5.0, -4.0);
+        let b: f32x4 = f32x4::new(9.0, 3.0, 2.0, 8.0);
+        let e: f32x4 = f32x4::new(8.0, 1.0, 3.0, 12.0);
+        let r: f32x4 = transmute(vabdq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vand_u8() {
-        let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: u8x8 = u8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
-        let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b)));
+    unsafe fn test_vabdl_u8() {
+        let a: u8x8 = u8x8::new(1, 2, 3, 4, 4, 3, 2, 1);
+        let b: u8x8 = u8x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+        let e: u16x8 = u16x8::new(9, 8, 7, 6, 6, 7, 8, 9);
+        let r: u16x8 = transmute(vabdl_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
+    }
-        let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b)));
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdl_u16() {
+        let a: u16x4 = u16x4::new(1, 2, 3, 4);
+        let b: u16x4 = u16x4::new(10, 10, 10, 10);
+        let e: u32x4 = u32x4::new(9, 8, 7, 6);
+        let r: u32x4 = transmute(vabdl_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vandq_u8() {
-        let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
-        let b: u8x16 = u8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
-        let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
-        let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vabdl_u32() {
+        let a: u32x2 = u32x2::new(1, 2);
+        let b: u32x2 = u32x2::new(10, 10);
+        let e: u64x2 = u64x2::new(9, 8);
+        let r: u64x2 = transmute(vabdl_u32(transmute(a), transmute(b)));
        assert_eq!(r, e);
+    }
-        let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
-        let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b)));
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdl_s8() {
+        let a: i8x8 = i8x8::new(1, 2, 3, 4, 4, 3, 2, 1);
+        let b: i8x8 = i8x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+        let e: i16x8 = i16x8::new(9, 8, 7, 6, 6, 7, 8, 9);
+        let r: i16x8 = transmute(vabdl_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vand_u16() {
-        let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: u16x4 = u16x4::new(0x0F, 0x0F, 0x0F, 0x0F);
-        let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b)));
+    unsafe fn test_vabdl_s16() {
+        let a: i16x4 = i16x4::new(1, 2, 11, 12);
+        let b: i16x4 = i16x4::new(10, 10, 10, 10);
+        let e: i32x4 = i32x4::new(9, 8, 1, 2);
+        let r: i32x4 = transmute(vabdl_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
+    }
-        let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
-        let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b)));
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vabdl_s32() {
+        let a: i32x2 = i32x2::new(1, 11);
+        let b: i32x2 = i32x2::new(10, 10);
+        let e: i64x2 = i64x2::new(9, 1);
+        let r: i64x2 = transmute(vabdl_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vandq_u16() {
-        let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: u16x8 = u16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
-        let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vceq_u8() {
+        let a: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
-        let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b)));
+        let a: u8x8 = u8x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: u8x8 = u8x8::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+        let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vand_u32() {
-        let a: u32x2 = u32x2::new(0x00, 0x01);
-        let b: u32x2 = u32x2::new(0x0F, 0x0F);
-        let e: u32x2 = u32x2::new(0x00, 0x01);
-        let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b)));
+    unsafe fn test_vceqq_u8() {
+        let a: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF);
+        let b: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF);
+        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
-        let a: u32x2 = u32x2::new(0x00, 0x01);
-        let b: u32x2 = u32x2::new(0x00, 0x00);
-        let e: u32x2 = u32x2::new(0x00, 0x00);
-        let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b)));
+        let a: u8x16 = u8x16::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0xFF);
+        let b: u8x16 = u8x16::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, 0);
+        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+        let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vandq_u32() {
-        let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: u32x4 = u32x4::new(0x0F, 0x0F, 0x0F, 0x0F);
-        let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vceq_u16() {
+        let a: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03);
+        let b: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03);
+        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b)));
        assert_eq!(r, e);
-        let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
-        let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b)));
+        let a: u16x4 = u16x4::new(0, 0, 0x02, 0x03);
+        let b: u16x4 = u16x4::new(0, 0xFF_FF, 0x02, 0x04);
+        let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0);
+        let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vand_s64() {
-        let a: i64x1 = i64x1::new(0x00);
-        let b: i64x1 = i64x1::new(0x0F);
-        let e: i64x1 = i64x1::new(0x00);
-        let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b)));
+    unsafe fn test_vceqq_u16() {
+        let a: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
-        let a: i64x1 = i64x1::new(0x00);
-        let b: i64x1 = i64x1::new(0x00);
-        let e: i64x1 = i64x1::new(0x00);
-        let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b)));
+        let a: u16x8 = u16x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: u16x8 = u16x8::new(0, 0xFF_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+        let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0);
+        let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vandq_s64() {
-        let a: i64x2 = i64x2::new(0x00, 0x01);
-        let b: i64x2 = i64x2::new(0x0F, 0x0F);
-        let e: i64x2 = i64x2::new(0x00, 0x01);
-        let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b)));
+    unsafe fn test_vceq_u32() {
+        let a: u32x2 = u32x2::new(0, 0x01);
+        let b: u32x2 = u32x2::new(0, 0x01);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
-        let a: i64x2 = i64x2::new(0x00, 0x01);
-        let b: i64x2 = i64x2::new(0x00, 0x00);
-        let e: i64x2 = i64x2::new(0x00, 0x00);
-        let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b)));
+        let a: u32x2 = u32x2::new(0, 0);
+        let b: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+        let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vand_u64() {
-        let a: u64x1 = u64x1::new(0x00);
-        let b: u64x1 = u64x1::new(0x0F);
-        let e: u64x1 = u64x1::new(0x00);
-        let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b)));
+    unsafe fn test_vceqq_u32() {
+        let a: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03);
+        let b: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
-        let a: u64x1 = u64x1::new(0x00);
-        let b: u64x1 = u64x1::new(0x00);
-        let e: u64x1 = u64x1::new(0x00);
-        let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b)));
+        let a: u32x4 = u32x4::new(0, 0, 0x02, 0x03);
+        let b: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0x02, 0x04);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
+        let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vandq_u64() {
-        let a: u64x2 = u64x2::new(0x00, 0x01);
-        let b: u64x2 = u64x2::new(0x0F, 0x0F);
-        let e: u64x2 = u64x2::new(0x00, 0x01);
-        let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b)));
+    unsafe fn test_vceq_s8() {
+        let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
-        let a: u64x2 = u64x2::new(0x00, 0x01);
-        let b: u64x2 = u64x2::new(0x00, 0x00);
-        let e: u64x2 = u64x2::new(0x00, 0x00);
-        let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b)));
+        let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+        let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorr_s8() {
-        let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: i8x8 = transmute(vorr_s8(transmute(a), transmute(b)));
+    unsafe fn test_vceqq_s8() {
+        let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+        let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
-    }
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vorrq_s8() {
-        let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-        let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-        let r: i8x16 = transmute(vorrq_s8(transmute(a), transmute(b)));
+        let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F);
+        let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128);
+        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+        let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorr_s16() {
-        let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: i16x4 = transmute(vorr_s16(transmute(a), transmute(b)));
+    unsafe fn test_vceq_s16() {
+        let a: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03);
+        let b: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03);
+        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
-    }
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vorrq_s16() {
-        let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: i16x8 = transmute(vorrq_s16(transmute(a), transmute(b)));
+        let a: i16x4 = i16x4::new(-32768, -32768, 0x02, 0x03);
+        let b: i16x4 = i16x4::new(-32768, 0x7F_FF, 0x02, 0x04);
+        let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0);
+        let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorr_s32() {
-        let a: i32x2 = i32x2::new(0x00, 0x01);
-        let b: i32x2 = i32x2::new(0x00, 0x00);
-        let e: i32x2 = i32x2::new(0x00, 0x01);
-        let r: i32x2 = transmute(vorr_s32(transmute(a), transmute(b)));
+    unsafe fn test_vceqq_s16() {
+        let a: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
-    }
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vorrq_s32() {
-        let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: i32x4 = transmute(vorrq_s32(transmute(a), transmute(b)));
+        let a: i16x8 = i16x8::new(-32768, -32768, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i16x8 = i16x8::new(-32768, 0x7F_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+        let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0);
+        let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorr_u8() {
-        let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: u8x8 = transmute(vorr_u8(transmute(a), transmute(b)));
+    unsafe fn test_vceq_s32() {
+        let a: i32x2 = i32x2::new(-2147483648, 0x01);
+        let b: i32x2 = i32x2::new(-2147483648, 0x01);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
-    }
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vorrq_u8() {
-        let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-        let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-        let r: u8x16 = transmute(vorrq_u8(transmute(a), transmute(b)));
+        let a: i32x2 = i32x2::new(-2147483648, -2147483648);
+        let b: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+        let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorr_u16() {
-        let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: u16x4 = transmute(vorr_u16(transmute(a), transmute(b)));
+    unsafe fn test_vceqq_s32() {
+        let a: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03);
+        let b: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
-    }
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vorrq_u16() {
-        let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: u16x8 = transmute(vorrq_u16(transmute(a), transmute(b)));
+        let a: i32x4 = i32x4::new(-2147483648, -2147483648, 0x02, 0x03);
+        let b: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, 0x02, 0x04);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
+        let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorr_u32() {
-        let a: u32x2 = u32x2::new(0x00, 0x01);
-        let b: u32x2 = u32x2::new(0x00, 0x00);
-        let e: u32x2 = u32x2::new(0x00, 0x01);
-        let r: u32x2 = transmute(vorr_u32(transmute(a), transmute(b)));
+    unsafe fn test_vceq_p8() {
+        let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b)));
         assert_eq!(r, e);
-    }
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vorrq_u32() {
-        let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: u32x4 = transmute(vorrq_u32(transmute(a), transmute(b)));
+        let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+        let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorr_s64() {
-        let a: i64x1 = i64x1::new(0x00);
-        let b: i64x1 = i64x1::new(0x00);
-        let e: i64x1 = i64x1::new(0x00);
-        let r: i64x1 = transmute(vorr_s64(transmute(a), transmute(b)));
+    unsafe fn test_vceqq_p8() {
+        let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+        let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b)));
         assert_eq!(r, e);
-    }
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vorrq_s64() {
-        let a: i64x2 = i64x2::new(0x00, 0x01);
-        let b: i64x2 = i64x2::new(0x00, 0x00);
-        let e: i64x2 = i64x2::new(0x00, 0x01);
-        let r: i64x2 = transmute(vorrq_s64(transmute(a), transmute(b)));
+        let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F);
+        let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128);
+        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+        let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorr_u64() {
-        let a: u64x1 = u64x1::new(0x00);
-        let b: u64x1 = u64x1::new(0x00);
-        let e: u64x1 = u64x1::new(0x00);
-        let r: u64x1 = transmute(vorr_u64(transmute(a), transmute(b)));
+    unsafe fn test_vceq_f32() {
+        let a: f32x2 = f32x2::new(1.2, 3.4);
+        let b: f32x2 = f32x2::new(1.2, 3.4);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vceq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vorrq_u64() {
-        let a: u64x2 = u64x2::new(0x00, 0x01);
-        let b: u64x2 = u64x2::new(0x00, 0x00);
-        let e: u64x2 = u64x2::new(0x00, 0x01);
-        let r: u64x2 = transmute(vorrq_u64(transmute(a), transmute(b)));
+    unsafe fn test_vceqq_f32() {
+        let a: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8);
+        let b: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vceqq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veor_s8() {
-        let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: i8x8 = transmute(veor_s8(transmute(a), transmute(b)));
+    unsafe fn test_vtst_s8() {
+        let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vtst_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veorq_s8() {
-        let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-        let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-        let r: i8x16 = transmute(veorq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vtstq_s8() {
+        let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+        let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vtstq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veor_s16() {
-        let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: i16x4 = transmute(veor_s16(transmute(a), transmute(b)));
+    unsafe fn test_vtst_s16() {
+        let a: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
+        let b: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
+        let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF);
+        let r: u16x4 = transmute(vtst_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veorq_s16() {
-        let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: i16x8 = transmute(veorq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vtstq_s16() {
+        let a: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let b: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x8 = transmute(vtstq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veor_s32() {
-        let a: i32x2 = i32x2::new(0x00, 0x01);
-        let b: i32x2 = i32x2::new(0x00, 0x00);
-        let e: i32x2 = i32x2::new(0x00, 0x01);
-        let r: i32x2 = transmute(veor_s32(transmute(a), transmute(b)));
+    unsafe fn test_vtst_s32() {
+        let a: i32x2 = i32x2::new(-2147483648, 0x00);
+        let b: i32x2 = i32x2::new(-2147483648, 0x00);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+        let r: u32x2 = transmute(vtst_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veorq_s32() {
-        let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: i32x4 = transmute(veorq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vtstq_s32() {
+        let a: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02);
+        let b: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vtstq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veor_u8() {
-        let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: u8x8 = transmute(veor_u8(transmute(a), transmute(b)));
+    unsafe fn test_vtst_p8() {
+        let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vtst_p8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veorq_u8() {
-        let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-        let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
-        let r: u8x16 = transmute(veorq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vtstq_p8() {
+        let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+        let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vtstq_p8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veor_u16() {
-        let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: u16x4 = transmute(veor_u16(transmute(a), transmute(b)));
+    unsafe fn test_vtst_u8() {
+        let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let b: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let e: u8x8 = u8x8::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vtst_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veorq_u16() {
-        let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let r: u16x8 = transmute(veorq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vtstq_u8() {
+        let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF);
+        let b: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF);
+        let e: u8x16 = u8x16::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vtstq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veor_u32() {
-        let a: u32x2 = u32x2::new(0x00, 0x01);
-        let b: u32x2 = u32x2::new(0x00, 0x00);
-        let e: u32x2 = u32x2::new(0x00, 0x01);
-        let r: u32x2 = transmute(veor_u32(transmute(a), transmute(b)));
+    unsafe fn test_vtst_u16() {
+        let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02);
+        let b: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02);
+        let e: u16x4 = u16x4::new(0, 0, 0xFF_FF, 0xFF_FF);
+        let r: u16x4 = transmute(vtst_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veorq_u32() {
-        let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
-        let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
-        let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
-        let r: u32x4 = transmute(veorq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vtstq_u16() {
+        let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let b: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+        let e: u16x8 = u16x8::new(0, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x8 = transmute(vtstq_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veor_s64() {
-        let a: i64x1 = i64x1::new(0x00);
-        let b: i64x1 = i64x1::new(0x00);
-        let e: i64x1 = i64x1::new(0x00);
-        let r: i64x1 = transmute(veor_s64(transmute(a), transmute(b)));
+    unsafe fn test_vtst_u32() {
+        let a: u32x2 = u32x2::new(0, 0x00);
+        let b: u32x2 = u32x2::new(0, 0x00);
+        let e: u32x2 = u32x2::new(0, 0);
+        let r: u32x2 = transmute(vtst_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veorq_s64() {
-        let a: i64x2 = i64x2::new(0x00, 0x01);
-        let b: i64x2 = i64x2::new(0x00, 0x00);
-        let e: i64x2 = i64x2::new(0x00, 0x01);
-        let r: i64x2 = transmute(veorq_s64(transmute(a), transmute(b)));
+    unsafe fn test_vtstq_u32() {
+        let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02);
+        let b: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02);
+        let e: u32x4 = u32x4::new(0, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vtstq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veor_u64() {
-        let a: u64x1 = u64x1::new(0x00);
-        let b: u64x1 = u64x1::new(0x00);
-        let e: u64x1 = u64x1::new(0x00);
-        let r: u64x1 = transmute(veor_u64(transmute(a), transmute(b)));
+    unsafe fn test_vabs_f32() {
+        let a: f32x2 = f32x2::new(-0.1, -2.2);
+        let e: f32x2 = f32x2::new(0.1, 2.2);
+        let r: f32x2 = transmute(vabs_f32(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_veorq_u64() {
-        let a: u64x2 = u64x2::new(0x00, 0x01);
-        let b: u64x2 = u64x2::new(0x00, 0x00);
-        let e: u64x2 = u64x2::new(0x00, 0x01);
-        let r: u64x2 = transmute(veorq_u64(transmute(a), transmute(b)));
+    unsafe fn test_vabsq_f32() {
+        let a: f32x4 = f32x4::new(-0.1, -2.2, -3.3, -6.6);
+        let e: f32x4 = f32x4::new(0.1, 2.2, 3.3, 6.6);
+        let r: f32x4 = transmute(vabsq_f32(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabd_s8() {
+    unsafe fn test_vcgt_s8() {
         let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: i8x8 = i8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
-        let e: i8x8 = i8x8::new(15, 13, 11, 9, 7, 5, 3, 1);
-        let r: i8x8 = transmute(vabd_s8(transmute(a), transmute(b)));
+        let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vcgt_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabdq_s8() {
+    unsafe fn test_vcgtq_s8() {
         let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b: i8x16 = i8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-        let e: i8x16 = i8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15);
-        let r: i8x16 = transmute(vabdq_s8(transmute(a), transmute(b)));
+        let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vcgtq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabd_s16() {
+    unsafe fn test_vcgt_s16() {
         let a: i16x4 = i16x4::new(1, 2, 3, 4);
-        let b: i16x4 = i16x4::new(16, 15, 14, 13);
-        let e: i16x4 = i16x4::new(15, 13, 11, 9);
-        let r: i16x4 = transmute(vabd_s16(transmute(a), transmute(b)));
+        let b: i16x4 = i16x4::new(0, 1, 2, 3);
+        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x4 = transmute(vcgt_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabdq_s16() {
+    unsafe fn test_vcgtq_s16() {
         let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: i16x8 = i16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
-        let e: i16x8 = i16x8::new(15, 13, 11, 9, 7, 5, 3, 1);
-        let r: i16x8 = transmute(vabdq_s16(transmute(a), transmute(b)));
+        let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x8 = transmute(vcgtq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabd_s32() {
+    unsafe fn test_vcgt_s32() {
         let a: i32x2 = i32x2::new(1, 2);
-        let b: i32x2 = i32x2::new(16, 15);
-        let e: i32x2 = i32x2::new(15, 13);
-        let r: i32x2 = transmute(vabd_s32(transmute(a), transmute(b)));
+        let b: i32x2 = i32x2::new(0, 1);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcgt_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabdq_s32() {
+    unsafe fn test_vcgtq_s32() {
         let a: i32x4 = i32x4::new(1, 2, 3, 4);
-        let b: i32x4 = i32x4::new(16, 15, 14, 13);
-        let e: i32x4 = i32x4::new(15, 13, 11, 9);
-        let r: i32x4 = transmute(vabdq_s32(transmute(a), transmute(b)));
+        let b: i32x4 = i32x4::new(0, 1, 2, 3);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcgtq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabd_u8() {
+    unsafe fn test_vcgt_u8() {
         let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: u8x8 = u8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
-        let e: u8x8 = u8x8::new(15, 13, 11, 9, 7, 5, 3, 1);
-        let r: u8x8 = transmute(vabd_u8(transmute(a), transmute(b)));
+        let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vcgt_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabdq_u8() {
+    unsafe fn test_vcgtq_u8() {
         let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b: u8x16 = u8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-        let e: u8x16 = u8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15);
-        let r: u8x16 = transmute(vabdq_u8(transmute(a), transmute(b)));
+        let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vcgtq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vabd_u16() {
+    unsafe fn test_vcgt_u16() {
         let a: u16x4 = u16x4::new(1, 2, 3, 4);
u16x4 = u16x4::new(1, 2, 3, 4); - let b: u16x4 = u16x4::new(16, 15, 14, 13); - let e: u16x4 = u16x4::new(15, 13, 11, 9); - let r: u16x4 = transmute(vabd_u16(transmute(a), transmute(b))); + let b: u16x4 = u16x4::new(0, 1, 2, 3); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vcgt_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_u16() { + unsafe fn test_vcgtq_u16() { let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let b: u16x8 = u16x8::new(16, 15, 14, 13, 12, 11, 10, 9); - let e: u16x8 = u16x8::new(15, 13, 11, 9, 7, 5, 3, 1); - let r: u16x8 = transmute(vabdq_u16(transmute(a), transmute(b))); + let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vcgtq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_u32() { + unsafe fn test_vcgt_u32() { let a: u32x2 = u32x2::new(1, 2); - let b: u32x2 = u32x2::new(16, 15); - let e: u32x2 = u32x2::new(15, 13); - let r: u32x2 = transmute(vabd_u32(transmute(a), transmute(b))); + let b: u32x2 = u32x2::new(0, 1); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vcgt_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_u32() { + unsafe fn test_vcgtq_u32() { let a: u32x4 = u32x4::new(1, 2, 3, 4); - let b: u32x4 = u32x4::new(16, 15, 14, 13); - let e: u32x4 = u32x4::new(15, 13, 11, 9); - let r: u32x4 = transmute(vabdq_u32(transmute(a), transmute(b))); + let b: u32x4 = u32x4::new(0, 1, 2, 3); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcgtq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_f32() { - let a: f32x2 = f32x2::new(1.0, 2.0); - let b: f32x2 = f32x2::new(9.0, 3.0); - let e: f32x2 = f32x2::new(8.0, 1.0); - let r: f32x2 = transmute(vabd_f32(transmute(a), transmute(b))); + unsafe fn test_vcgt_f32() { + let a: f32x2 = f32x2::new(1.2, 2.3); + let b: f32x2 = f32x2::new(0.1, 1.2); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vcgt_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_f32() { - let a: f32x4 = f32x4::new(1.0, 2.0, 5.0, -4.0); - let b: f32x4 = f32x4::new(9.0, 3.0, 2.0, 8.0); - let e: f32x4 = f32x4::new(8.0, 1.0, 3.0, 12.0); - let r: f32x4 = transmute(vabdq_f32(transmute(a), transmute(b))); + unsafe fn test_vcgtq_f32() { + let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5); + let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcgtq_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_u8() { - let a: u8x8 = u8x8::new(1, 2, 3, 4, 4, 3, 2, 1); - let b: u8x8 = u8x8::new(10, 10, 10, 10, 10, 10, 10, 10); - let e: u16x8 = u16x8::new(9, 8, 7, 6, 6, 7, 8, 9); - let r: u16x8 = transmute(vabdl_u8(transmute(a), transmute(b))); + unsafe fn test_vclt_s8() { + let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vclt_s8(transmute(a), transmute(b))); assert_eq!(r, e); } 
#[simd_test(enable = "neon")] - unsafe fn test_vabdl_u16() { - let a: u16x4 = u16x4::new(1, 2, 3, 4); - let b: u16x4 = u16x4::new(10, 10, 10, 10); - let e: u32x4 = u32x4::new(9, 8, 7, 6); - let r: u32x4 = transmute(vabdl_u16(transmute(a), transmute(b))); + unsafe fn test_vcltq_s8() { + let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vcltq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclt_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vclt_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_s16() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vcltq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclt_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(1, 2); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vclt_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcltq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclt_u8() { + let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vclt_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_u8() { + let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vcltq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclt_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vclt_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_u32() { - let a: u32x2 = u32x2::new(1, 2); - let b: u32x2 = u32x2::new(10, 10); - let e: u64x2 = u64x2::new(9, 8); - let r: u64x2 = transmute(vabdl_u32(transmute(a), transmute(b))); + unsafe fn test_vcltq_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vcltq_u16(transmute(a), transmute(b))); 
assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_s8() { - let a: i8x8 = i8x8::new(1, 2, 3, 4, 4, 3, 2, 1); - let b: i8x8 = i8x8::new(10, 10, 10, 10, 10, 10, 10, 10); - let e: i16x8 = i16x8::new(9, 8, 7, 6, 6, 7, 8, 9); - let r: i16x8 = transmute(vabdl_s8(transmute(a), transmute(b))); + unsafe fn test_vclt_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(1, 2); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vclt_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_s16() { - let a: i16x4 = i16x4::new(1, 2, 11, 12); - let b: i16x4 = i16x4::new(10, 10, 10, 10); - let e: i32x4 = i32x4::new(9, 8, 1, 2); - let r: i32x4 = transmute(vabdl_s16(transmute(a), transmute(b))); + unsafe fn test_vcltq_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcltq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_s32() { - let a: i32x2 = i32x2::new(1, 11); - let b: i32x2 = i32x2::new(10, 10); - let e: i64x2 = i64x2::new(9, 1); - let r: i64x2 = transmute(vabdl_s32(transmute(a), transmute(b))); + unsafe fn test_vclt_f32() { + let a: f32x2 = f32x2::new(0.1, 1.2); + let b: f32x2 = f32x2::new(1.2, 2.3); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vclt_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_u8() { - let a: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b))); + unsafe fn test_vcltq_f32() { + let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4); + let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcltq_f32(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u8x8 = u8x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); - let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); - let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vcle_s8() { + let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vcle_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_u8() { - let a: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF); - let b: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF); + unsafe fn test_vcleq_s8() { + let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u8x16 
= u8x16::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0xFF); - let b: u8x16 = u8x16::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, 0); - let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); - let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b))); + let r: u8x16 = transmute(vcleq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_u16() { - let a: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03); - let b: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03); + unsafe fn test_vcle_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(1, 2, 3, 4); let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u16x4 = u16x4::new(0, 0, 0x02, 0x03); - let b: u16x4 = u16x4::new(0, 0xFF_FF, 0x02, 0x04); - let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0); - let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b))); + let r: u16x4 = transmute(vcle_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_u16() { - let a: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + unsafe fn test_vcleq_s16() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u16x8 = u16x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0, 0xFF_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); - let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0); - let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b))); + let r: u16x8 = transmute(vcleq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_u32() { - let a: u32x2 = u32x2::new(0, 0x01); - let b: u32x2 = u32x2::new(0, 0x01); + unsafe fn test_vcle_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(1, 2); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u32x2 = u32x2::new(0, 0); - let b: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF); - let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); - let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b))); + let r: u32x2 = transmute(vcle_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_u32() { - let a: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03); - let b: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03); + unsafe fn test_vcleq_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(1, 2, 3, 4); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u32x4 = u32x4::new(0, 0, 0x02, 0x03); - let b: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0x02, 0x04); - let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0); - let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b))); + let r: u32x4 = transmute(vcleq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } 
    #[simd_test(enable = "neon")]
-    unsafe fn test_vceq_s8() {
-        let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+    unsafe fn test_vcle_u8() {
+        let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
         let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-
-        let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
-        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
-        let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b)));
+        let r: u8x8 = transmute(vcle_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceqq_s8() {
-        let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
-        let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+    unsafe fn test_vcleq_u8() {
+        let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
         let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-
-        let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F);
-        let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128);
-        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
-        let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b)));
+        let r: u8x16 = transmute(vcleq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceq_s16() {
-        let a: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03);
-        let b: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03);
+    unsafe fn test_vcle_u16() {
+        let a: u16x4 = u16x4::new(0, 1, 2, 3);
+        let b: u16x4 = u16x4::new(1, 2, 3, 4);
         let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-
-        let a: i16x4 = i16x4::new(-32768, -32768, 0x02, 0x03);
-        let b: i16x4 = i16x4::new(-32768, 0x7F_FF, 0x02, 0x04);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0);
-        let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b)));
+        let r: u16x4 = transmute(vcle_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceqq_s16() {
-        let a: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+    unsafe fn test_vcleq_u16() {
+        let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
         let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-
-        let a: i16x8 = i16x8::new(-32768, -32768, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i16x8 = i16x8::new(-32768, 0x7F_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0);
-        let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b)));
+        let r: u16x8 = transmute(vcleq_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceq_s32() {
-        let a: i32x2 = i32x2::new(-2147483648, 0x01);
-        let b: i32x2 = i32x2::new(-2147483648, 0x01);
+    unsafe fn test_vcle_u32() {
+        let a: u32x2 = u32x2::new(0, 1);
+        let b: u32x2 = u32x2::new(1, 2);
         let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-
-        let a: i32x2 = i32x2::new(-2147483648, -2147483648);
-        let b: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
-        let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b)));
+        let r: u32x2 = transmute(vcle_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceqq_s32() {
-        let a: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03);
-        let b: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03);
+    unsafe fn test_vcleq_u32() {
+        let a: u32x4 = u32x4::new(0, 1, 2, 3);
+        let b: u32x4 = u32x4::new(1, 2, 3, 4);
         let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-
-        let a: i32x4 = i32x4::new(-2147483648, -2147483648, 0x02, 0x03);
-        let b: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, 0x02, 0x04);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
-        let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b)));
+        let r: u32x4 = transmute(vcleq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceq_p8() {
-        let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-
-        let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
-        let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
-        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
-        let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b)));
+    unsafe fn test_vcle_f32() {
+        let a: f32x2 = f32x2::new(0.1, 1.2);
+        let b: f32x2 = f32x2::new(1.2, 2.3);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcle_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceqq_p8() {
-        let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
-        let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-
-        let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F);
-        let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128);
-        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
-        let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b)));
+    unsafe fn test_vcleq_f32() {
+        let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
+        let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcleq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceq_f32() {
-        let a: f32x2 = f32x2::new(1.2, 3.4);
-        let b: f32x2 = f32x2::new(1.2, 3.4);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vceq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vcge_s8() {
+        let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vcge_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vceqq_f32() {
-        let a: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8);
-        let b: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vceqq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vcgeq_s8() {
+        let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vcgeq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtst_s8() {
-        let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vtst_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcge_s16() {
+        let a: i16x4 = i16x4::new(1, 2, 3, 4);
+        let b: i16x4 = i16x4::new(0, 1, 2, 3);
+        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x4 = transmute(vcge_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtstq_s8() {
-        let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
-        let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
-        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vtstq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcgeq_s16() {
+        let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x8 = transmute(vcgeq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtst_s16() {
-        let a: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
-        let b: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vtst_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcge_s32() {
+        let a: i32x2 = i32x2::new(1, 2);
+        let b: i32x2 = i32x2::new(0, 1);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcge_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtstq_s16() {
-        let a: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let b: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vtstq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcgeq_s32() {
+        let a: i32x4 = i32x4::new(1, 2, 3, 4);
+        let b: i32x4 = i32x4::new(0, 1, 2, 3);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcgeq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtst_s32() {
-        let a: i32x2 = i32x2::new(-2147483648, 0x00);
-        let b: i32x2 = i32x2::new(-2147483648, 0x00);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
-        let r: u32x2 = transmute(vtst_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcge_u8() {
+        let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x8 = transmute(vcge_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtstq_s32() {
-        let a: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02);
-        let b: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vtstq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcgeq_u8() {
+        let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+        let r: u8x16 = transmute(vcgeq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtst_p8() {
-        let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vtst_p8(transmute(a), transmute(b)));
+    unsafe fn test_vcge_u16() {
+        let a: u16x4 = u16x4::new(1, 2, 3, 4);
+        let b: u16x4 = u16x4::new(0, 1, 2, 3);
+        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x4 = transmute(vcge_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtstq_p8() {
-        let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
-        let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
-        let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vtstq_p8(transmute(a), transmute(b)));
+    unsafe fn test_vcgeq_u16() {
+        let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+        let r: u16x8 = transmute(vcgeq_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtst_u8() {
-        let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let b: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let e: u8x8 = u8x8::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vtst_u8(transmute(a), transmute(b)));
+    unsafe fn test_vcge_u32() {
+        let a: u32x2 = u32x2::new(1, 2);
+        let b: u32x2 = u32x2::new(0, 1);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcge_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtstq_u8() {
-        let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF);
-        let b: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF);
-        let e: u8x16 = u8x16::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vtstq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vcgeq_u32() {
+        let a: u32x4 = u32x4::new(1, 2, 3, 4);
+        let b: u32x4 = u32x4::new(0, 1, 2, 3);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcgeq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtst_u16() {
-        let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02);
-        let b: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02);
-        let e: u16x4 = u16x4::new(0, 0, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vtst_u16(transmute(a), transmute(b)));
+    unsafe fn test_vcge_f32() {
+        let a: f32x2 = f32x2::new(1.2, 2.3);
+        let b: f32x2 = f32x2::new(0.1, 1.2);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcge_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtstq_u16() {
-        let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let b: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
-        let e: u16x8 = u16x8::new(0, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vtstq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vcgeq_f32() {
+        let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
+        let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcgeq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtst_u32() {
-        let a: u32x2 = u32x2::new(0, 0x00);
-        let b: u32x2 = u32x2::new(0, 0x00);
-        let e: u32x2 = u32x2::new(0, 0);
-        let r: u32x2 = transmute(vtst_u32(transmute(a), transmute(b)));
+    unsafe fn test_vcls_s8() {
+        let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i8x8 = i8x8::new(0, 7, 7, 7, 7, 7, 7, 7);
+        let r: i8x8 = transmute(vcls_s8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vtstq_u32() {
-        let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02);
-        let b: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02);
-        let e: u32x4 = u32x4::new(0, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vtstq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vclsq_s8() {
+        let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F);
+        let e: i8x16 = i8x16::new(0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
+        let r: i8x16 = transmute(vclsq_s8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vabs_f32() {
-        let a: f32x2 = f32x2::new(-0.1, -2.2);
-        let e: f32x2 = f32x2::new(0.1, 2.2);
-        let r: f32x2 = transmute(vabs_f32(transmute(a)));
+    unsafe fn test_vcls_s16() {
+        let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x00);
+        let e: i16x4 = i16x4::new(0, 15, 15, 15);
+        let r: i16x4 = transmute(vcls_s16(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vabsq_f32() {
-        let a: f32x4 = f32x4::new(-0.1, -2.2, -3.3, -6.6);
-        let e: f32x4 = f32x4::new(0.1, 2.2, 3.3, 6.6);
-        let r: f32x4 = transmute(vabsq_f32(transmute(a)));
+    unsafe fn test_vclsq_s16() {
+        let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i16x8 = i16x8::new(0, 15, 15, 15, 15, 15, 15, 15);
+        let r: i16x8 = transmute(vclsq_s16(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgt_s8() {
-        let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcgt_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcls_s32() {
+        let a: i32x2 = i32x2::new(-2147483648, -1);
+        let e: i32x2 = i32x2::new(0, 31);
+        let r: i32x2 = transmute(vcls_s32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgtq_s8() {
-        let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcgtq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vclsq_s32() {
+        let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x00);
+        let e: i32x4 = i32x4::new(0, 31, 31, 31);
+        let r: i32x4 = transmute(vclsq_s32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgt_s16() {
-        let a: i16x4 = i16x4::new(1, 2, 3, 4);
-        let b: i16x4 = i16x4::new(0, 1, 2, 3);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcgt_s16(transmute(a), transmute(b)));
+    unsafe fn test_vclz_s8() {
+        let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01);
+        let e: i8x8 = i8x8::new(0, 0, 8, 7, 7, 7, 7, 7);
+        let r: i8x8 = transmute(vclz_s8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgtq_s16() {
-        let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcgtq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vclzq_s8() {
+        let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x7F);
+        let e: i8x16 = i8x16::new(0, 0, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1);
+        let r: i8x16 = transmute(vclzq_s8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgt_s32() {
-        let a: i32x2 = i32x2::new(1, 2);
-        let b: i32x2 = i32x2::new(0, 1);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcgt_s32(transmute(a), transmute(b)));
+    unsafe fn test_vclz_s16() {
+        let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01);
+        let e: i16x4 = i16x4::new(0, 0, 16, 15);
+        let r: i16x4 = transmute(vclz_s16(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgtq_s32() {
-        let a: i32x4 = i32x4::new(1, 2, 3, 4);
-        let b: i32x4 = i32x4::new(0, 1, 2, 3);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgtq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vclzq_s16() {
+        let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01);
+        let e: i16x8 = i16x8::new(0, 0, 16, 15, 15, 15, 15, 15);
+        let r: i16x8 = transmute(vclzq_s16(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgt_u8() {
-        let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcgt_u8(transmute(a), transmute(b)));
+    unsafe fn test_vclz_s32() {
+        let a: i32x2 = i32x2::new(-2147483648, -1);
+        let e: i32x2 = i32x2::new(0, 0);
+        let r: i32x2 = transmute(vclz_s32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgtq_u8() {
-        let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcgtq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vclzq_s32() {
+        let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01);
+        let e: i32x4 = i32x4::new(0, 0, 32, 31);
+        let r: i32x4 = transmute(vclzq_s32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgt_u16() {
-        let a: u16x4 = u16x4::new(1, 2, 3, 4);
-        let b: u16x4 = u16x4::new(0, 1, 2, 3);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcgt_u16(transmute(a), transmute(b)));
+    unsafe fn test_vclz_u8() {
+        let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01);
+        let e: u8x8 = u8x8::new(8, 8, 7, 7, 7, 7, 7, 7);
+        let r: u8x8 = transmute(vclz_u8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgtq_u16() {
-        let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcgtq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vclzq_u8() {
+        let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xFF);
+        let e: u8x16 = u8x16::new(8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
+        let r: u8x16 = transmute(vclzq_u8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgt_u32() {
-        let a: u32x2 = u32x2::new(1, 2);
-        let b: u32x2 = u32x2::new(0, 1);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcgt_u32(transmute(a), transmute(b)));
+    unsafe fn test_vclz_u16() {
+        let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x01);
+        let e: u16x4 = u16x4::new(16, 16, 15, 15);
+        let r: u16x4 = transmute(vclz_u16(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgtq_u32() {
-        let a: u32x4 = u32x4::new(1, 2, 3, 4);
-        let b: u32x4 = u32x4::new(0, 1, 2, 3);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgtq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vclzq_u16() {
+        let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01);
+        let e: u16x8 = u16x8::new(16, 16, 15, 15, 15, 15, 15, 15);
+        let r: u16x8 = transmute(vclzq_u16(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgt_f32() {
-        let a: f32x2 = f32x2::new(1.2, 2.3);
-        let b: f32x2 = f32x2::new(0.1, 1.2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcgt_f32(transmute(a), transmute(b)));
+    unsafe fn test_vclz_u32() {
+        let a: u32x2 = u32x2::new(0, 0x00);
+        let e: u32x2 = u32x2::new(32, 32);
+        let r: u32x2 = transmute(vclz_u32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgtq_f32() {
-        let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
-        let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgtq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vclzq_u32() {
+        let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x01);
+        let e: u32x4 = u32x4::new(32, 32, 31, 31);
+        let r: u32x4 = transmute(vclzq_u32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_s8() {
-        let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vclt_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcagt_f32() {
+        let a: f32x2 = f32x2::new(-1.2, 0.0);
+        let b: f32x2 = f32x2::new(-1.1, 0.0);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+        let r: u32x2 = transmute(vcagt_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_s8() {
-        let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcltq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcagtq_f32() {
+        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
+        let r: u32x4 = transmute(vcagtq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_s16() {
-        let a: i16x4 = i16x4::new(0, 1, 2, 3);
-        let b: i16x4 = i16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vclt_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcage_f32() {
+        let a: f32x2 = f32x2::new(-1.2, 0.0);
+        let b: f32x2 = f32x2::new(-1.1, 0.0);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcage_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_s16() {
-        let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcltq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcageq_f32() {
+        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0);
+        let r: u32x4 = transmute(vcageq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

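// A rough scalar model of the vclz/vcls lanes tested above, using Rust's
// built-in bit helpers (a sketch only; the function names are illustrative):
fn clz_lane(x: i8) -> i8 {
    // vclz counts leading zero bits of each lane: 0x01 -> 7, 0x00 -> 8, -1 -> 0.
    x.leading_zeros() as i8
}
fn cls_lane(x: i8) -> i8 {
    // vcls counts how many bits after the sign bit match it:
    // -1 (0xFF) -> 7, 0x00 -> 7, -128 (0x80) -> 0, 0x7F -> 0.
    (if x < 0 { !x } else { x }).leading_zeros() as i8 - 1
}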
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_s32() {
-        let a: i32x2 = i32x2::new(0, 1);
-        let b: i32x2 = i32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vclt_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcalt_f32() {
+        let a: f32x2 = f32x2::new(-1.2, 0.0);
+        let b: f32x2 = f32x2::new(-1.1, 0.0);
+        let e: u32x2 = u32x2::new(0, 0);
+        let r: u32x2 = transmute(vcalt_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_s32() {
-        let a: i32x4 = i32x4::new(0, 1, 2, 3);
-        let b: i32x4 = i32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcltq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcaltq_f32() {
+        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+        let e: u32x4 = u32x4::new(0, 0, 0, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcaltq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_u8() {
-        let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vclt_u8(transmute(a), transmute(b)));
+    unsafe fn test_vcale_f32() {
+        let a: f32x2 = f32x2::new(-1.2, 0.0);
+        let b: f32x2 = f32x2::new(-1.1, 0.0);
+        let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcale_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_u8() {
-        let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcltq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vcaleq_f32() {
+        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+        let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcaleq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_u16() {
-        let a: u16x4 = u16x4::new(0, 1, 2, 3);
-        let b: u16x4 = u16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vclt_u16(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_s8() {
+        let a: u64 = 1;
+        let e: i8x8 = i8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
+        let r: i8x8 = transmute(vcreate_s8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_u16() {
-        let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcltq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_s32() {
+        let a: u64 = 1;
+        let e: i32x2 = i32x2::new(1, 0);
+        let r: i32x2 = transmute(vcreate_s32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_u32() {
-        let a: u32x2 = u32x2::new(0, 1);
-        let b: u32x2 = u32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vclt_u32(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_s64() {
+        let a: u64 = 1;
+        let e: i64x1 = i64x1::new(1);
+        let r: i64x1 = transmute(vcreate_s64(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_u32() {
-        let a: u32x4 = u32x4::new(0, 1, 2, 3);
-        let b: u32x4 = u32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcltq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_u8() {
+        let a: u64 = 1;
+        let e: u8x8 = u8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
+        let r: u8x8 = transmute(vcreate_u8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_f32() {
-        let a: f32x2 = f32x2::new(0.1, 1.2);
-        let b: f32x2 = f32x2::new(1.2, 2.3);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vclt_f32(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_u32() {
+        let a: u64 = 1;
+        let e: u32x2 = u32x2::new(1, 0);
+        let r: u32x2 = transmute(vcreate_u32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_f32() {
-        let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
-        let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcltq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_u64() {
+        let a: u64 = 1;
+        let e: u64x1 = u64x1::new(1);
+        let r: u64x1 = transmute(vcreate_u64(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_s8() {
-        let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcle_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_p8() {
+        let a: u64 = 1;
+        let e: i8x8 = i8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
+        let r: i8x8 = transmute(vcreate_p8(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_s8() {
-        let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcleq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_p16() {
+        let a: u64 = 1;
+        let e: i16x4 = i16x4::new(1, 0, 0, 0);
+        let r: i16x4 = transmute(vcreate_p16(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_s16() {
-        let a: i16x4 = i16x4::new(0, 1, 2, 3);
-        let b: i16x4 = i16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcle_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_p64() {
+        let a: u64 = 1;
+        let e: i64x1 = i64x1::new(1);
+        let r: i64x1 = transmute(vcreate_p64(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_s16() {
-        let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcleq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcreate_f32() {
+        let a: u64 = 0;
+        let e: f32x2 = f32x2::new(0., 0.);
+        let r: f32x2 = transmute(vcreate_f32(transmute(a)));
         assert_eq!(r, e);
     }

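// vcreate_* reinterprets one u64 bit pattern as a 64-bit vector, so on a
// little-endian target the low byte of the integer becomes lane 0. A minimal
// sketch (assumes aarch64 with NEON; `demo_vcreate` is an illustrative name):
#[cfg(target_arch = "aarch64")]
unsafe fn demo_vcreate() {
    use core::arch::aarch64::*;
    let v: uint8x8_t = vcreate_u8(1); // bit pattern 0x0000_0000_0000_0001
    assert_eq!(vget_lane_u8::<0>(v), 1); // low byte lands in lane 0
    assert_eq!(vget_lane_u8::<1>(v), 0); // remaining lanes are zero
}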
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_s32() {
-        let a: i32x2 = i32x2::new(0, 1);
-        let b: i32x2 = i32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcle_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_f32_s32() {
+        let a: i32x2 = i32x2::new(1, 2);
+        let e: f32x2 = f32x2::new(1., 2.);
+        let r: f32x2 = transmute(vcvt_f32_s32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_s32() {
-        let a: i32x4 = i32x4::new(0, 1, 2, 3);
-        let b: i32x4 = i32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcleq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_f32_s32() {
+        let a: i32x4 = i32x4::new(1, 2, 3, 4);
+        let e: f32x4 = f32x4::new(1., 2., 3., 4.);
+        let r: f32x4 = transmute(vcvtq_f32_s32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_u8() {
-        let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcle_u8(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_f32_u32() {
+        let a: u32x2 = u32x2::new(1, 2);
+        let e: f32x2 = f32x2::new(1., 2.);
+        let r: f32x2 = transmute(vcvt_f32_u32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_u8() {
-        let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcleq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_f32_u32() {
+        let a: u32x4 = u32x4::new(1, 2, 3, 4);
+        let e: f32x4 = f32x4::new(1., 2., 3., 4.);
+        let r: f32x4 = transmute(vcvtq_f32_u32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_u16() {
-        let a: u16x4 = u16x4::new(0, 1, 2, 3);
-        let b: u16x4 = u16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcle_u16(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_n_f32_s32() {
+        let a: i32x2 = i32x2::new(1, 2);
+        let e: f32x2 = f32x2::new(0.25, 0.5);
+        let r: f32x2 = transmute(vcvt_n_f32_s32::<2>(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vcvtq_n_f32_s32() {
+        let a: i32x4 = i32x4::new(1, 2, 3, 4);
+        let e: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.);
+        let r: f32x4 = transmute(vcvtq_n_f32_s32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_u16() {
-        let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcleq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_n_f32_u32() {
+        let a: u32x2 = u32x2::new(1, 2);
+        let e: f32x2 = f32x2::new(0.25, 0.5);
+        let r: f32x2 = transmute(vcvt_n_f32_u32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_u32() {
-        let a: u32x2 = u32x2::new(0, 1);
-        let b: u32x2 = u32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcle_u32(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_n_f32_u32() {
+        let a: u32x4 = u32x4::new(1, 2, 3, 4);
+        let e: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.);
+        let r: f32x4 = transmute(vcvtq_n_f32_u32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_u32() {
-        let a: u32x4 = u32x4::new(0, 1, 2, 3);
-        let b: u32x4 = u32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcleq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_n_s32_f32() {
+        let a: f32x2 = f32x2::new(0.25, 0.5);
+        let e: i32x2 = i32x2::new(1, 2);
+        let r: i32x2 = transmute(vcvt_n_s32_f32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_f32() {
-        let a: f32x2 = f32x2::new(0.1, 1.2);
-        let b: f32x2 = f32x2::new(1.2, 2.3);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcle_f32(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_n_s32_f32() {
+        let a: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.);
+        let e: i32x4 = i32x4::new(1, 2, 3, 4);
+        let r: i32x4 = transmute(vcvtq_n_s32_f32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_f32() {
-        let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
-        let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcleq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_n_u32_f32() {
+        let a: f32x2 = f32x2::new(0.25, 0.5);
+        let e: u32x2 = u32x2::new(1, 2);
+        let r: u32x2 = transmute(vcvt_n_u32_f32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_s8() {
-        let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcge_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_n_u32_f32() {
+        let a: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.);
+        let e: u32x4 = u32x4::new(1, 2, 3, 4);
+        let r: u32x4 = transmute(vcvtq_n_u32_f32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_s8() {
-        let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcgeq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_s32_f32() {
+        let a: f32x2 = f32x2::new(-1.1, 2.1);
+        let e: i32x2 = i32x2::new(-1, 2);
+        let r: i32x2 = transmute(vcvt_s32_f32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_s16() {
-        let a: i16x4 = i16x4::new(1, 2, 3, 4);
-        let b: i16x4 = i16x4::new(0, 1, 2, 3);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcge_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_s32_f32() {
+        let a: f32x4 = f32x4::new(-1.1, 2.1, -2.9, 3.9);
+        let e: i32x4 = i32x4::new(-1, 2, -2, 3);
+        let r: i32x4 = transmute(vcvtq_s32_f32(transmute(a)));
         assert_eq!(r, e);
     }

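// The `_n_` conversions are fixed-point: vcvt_n_f32_s32::<N> divides each
// integer lane by 2^N, and vcvt_n_s32_f32::<N> multiplies by 2^N before
// converting, which is how lanes 1 and 2 above become 0.25 and 0.5 with N = 2.
// A minimal sketch (assumes aarch64 with NEON; the name is illustrative):
#[cfg(target_arch = "aarch64")]
unsafe fn demo_fixed_point() {
    use core::arch::aarch64::*;
    let a = vdup_n_s32(1); // [1, 1]
    let f: float32x2_t = vcvt_n_f32_s32::<2>(a); // 1 / 2^2 = 0.25 per lane
    assert_eq!(vget_lane_f32::<0>(f), 0.25);
}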
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_s16() {
-        let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcgeq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_u32_f32() {
+        let a: f32x2 = f32x2::new(1.1, 2.1);
+        let e: u32x2 = u32x2::new(1, 2);
+        let r: u32x2 = transmute(vcvt_u32_f32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_s32() {
-        let a: i32x2 = i32x2::new(1, 2);
-        let b: i32x2 = i32x2::new(0, 1);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcge_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_u32_f32() {
+        let a: f32x4 = f32x4::new(1.1, 2.1, 2.9, 3.9);
+        let e: u32x4 = u32x4::new(1, 2, 2, 3);
+        let r: u32x4 = transmute(vcvtq_u32_f32(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_s32() {
-        let a: i32x4 = i32x4::new(1, 2, 3, 4);
-        let b: i32x4 = i32x4::new(0, 1, 2, 3);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgeq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_s8() {
+        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x8 = transmute(vdup_lane_s8::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_u8() {
-        let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcge_u8(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_s8() {
+        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x16 = transmute(vdupq_laneq_s8::<8>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_u8() {
-        let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcgeq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_s16() {
+        let a: i16x4 = i16x4::new(1, 1, 1, 4);
+        let e: i16x4 = i16x4::new(1, 1, 1, 1);
+        let r: i16x4 = transmute(vdup_lane_s16::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_u16() {
-        let a: u16x4 = u16x4::new(1, 2, 3, 4);
-        let b: u16x4 = u16x4::new(0, 1, 2, 3);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcge_u16(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_s16() {
+        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i16x8 = transmute(vdupq_laneq_s16::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_u16() {
-        let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcgeq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_s32() {
+        let a: i32x2 = i32x2::new(1, 1);
+        let e: i32x2 = i32x2::new(1, 1);
+        let r: i32x2 = transmute(vdup_lane_s32::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_u32() {
-        let a: u32x2 = u32x2::new(1, 2);
-        let b: u32x2 = u32x2::new(0, 1);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcge_u32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_s32() {
+        let a: i32x4 = i32x4::new(1, 1, 1, 4);
+        let e: i32x4 = i32x4::new(1, 1, 1, 1);
+        let r: i32x4 = transmute(vdupq_laneq_s32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_u32() {
-        let a: u32x4 = u32x4::new(1, 2, 3, 4);
-        let b: u32x4 = u32x4::new(0, 1, 2, 3);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgeq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_s8() {
+        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x8 = transmute(vdup_laneq_s8::<8>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_f32() {
-        let a: f32x2 = f32x2::new(1.2, 2.3);
-        let b: f32x2 = f32x2::new(0.1, 1.2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcge_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_s16() {
+        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i16x4 = i16x4::new(1, 1, 1, 1);
+        let r: i16x4 = transmute(vdup_laneq_s16::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_f32() {
-        let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
-        let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgeq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_s32() {
+        let a: i32x4 = i32x4::new(1, 1, 1, 4);
+        let e: i32x2 = i32x2::new(1, 1);
+        let r: i32x2 = transmute(vdup_laneq_s32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcls_s8() {
-        let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i8x8 = i8x8::new(0, 7, 7, 7, 7, 7, 7, 7);
-        let r: i8x8 = transmute(vcls_s8(transmute(a)));
+    unsafe fn test_vdupq_lane_s8() {
+        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x16 = transmute(vdupq_lane_s8::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclsq_s8() {
-        let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F);
-        let e: i8x16 = i8x16::new(0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
-        let r: i8x16 = transmute(vclsq_s8(transmute(a)));
+    unsafe fn test_vdupq_lane_s16() {
+        let a: i16x4 = i16x4::new(1, 1, 1, 4);
+        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i16x8 = transmute(vdupq_lane_s16::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcls_s16() {
-        let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x00);
-        let e: i16x4 = i16x4::new(0, 15, 15, 15);
-        let r: i16x4 = transmute(vcls_s16(transmute(a)));
+    unsafe fn test_vdupq_lane_s32() {
+        let a: i32x2 = i32x2::new(1, 1);
+        let e: i32x4 = i32x4::new(1, 1, 1, 1);
+        let r: i32x4 = transmute(vdupq_lane_s32::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclsq_s16() {
-        let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i16x8 = i16x8::new(0, 15, 15, 15, 15, 15, 15, 15);
-        let r: i16x8 = transmute(vclsq_s16(transmute(a)));
+    unsafe fn test_vdup_lane_u8() {
+        let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u8x8 = transmute(vdup_lane_u8::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcls_s32() {
-        let a: i32x2 = i32x2::new(-2147483648, -1);
-        let e: i32x2 = i32x2::new(0, 31);
-        let r: i32x2 = transmute(vcls_s32(transmute(a)));
+    unsafe fn test_vdupq_laneq_u8() {
+        let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u8x16 = transmute(vdupq_laneq_u8::<8>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclsq_s32() {
-        let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x00);
-        let e: i32x4 = i32x4::new(0, 31, 31, 31);
-        let r: i32x4 = transmute(vclsq_s32(transmute(a)));
+    unsafe fn test_vdup_lane_u16() {
+        let a: u16x4 = u16x4::new(1, 1, 1, 4);
+        let e: u16x4 = u16x4::new(1, 1, 1, 1);
+        let r: u16x4 = transmute(vdup_lane_u16::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_s8() {
-        let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01);
-        let e: i8x8 = i8x8::new(0, 0, 8, 7, 7, 7, 7, 7);
-        let r: i8x8 = transmute(vclz_s8(transmute(a)));
+    unsafe fn test_vdupq_laneq_u16() {
+        let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u16x8 = transmute(vdupq_laneq_u16::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_s8() {
-        let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x7F);
-        let e: i8x16 = i8x16::new(0, 0, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1);
-        let r: i8x16 = transmute(vclzq_s8(transmute(a)));
+    unsafe fn test_vdup_lane_u32() {
+        let a: u32x2 = u32x2::new(1, 1);
+        let e: u32x2 = u32x2::new(1, 1);
+        let r: u32x2 = transmute(vdup_lane_u32::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_s16() {
-        let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01);
-        let e: i16x4 = i16x4::new(0, 0, 16, 15);
-        let r: i16x4 = transmute(vclz_s16(transmute(a)));
+    unsafe fn test_vdupq_laneq_u32() {
+        let a: u32x4 = u32x4::new(1, 1, 1, 4);
+        let e: u32x4 = u32x4::new(1, 1, 1, 1);
+        let r: u32x4 = transmute(vdupq_laneq_u32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_s16() {
-        let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01);
-        let e: i16x8 = i16x8::new(0, 0, 16, 15, 15, 15, 15, 15);
-        let r: i16x8 = transmute(vclzq_s16(transmute(a)));
+    unsafe fn test_vdup_laneq_u8() {
+        let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u8x8 = transmute(vdup_laneq_u8::<8>(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vdup_laneq_u16() {
+        let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: u16x4 = u16x4::new(1, 1, 1, 1);
+        let r: u16x4 = transmute(vdup_laneq_u16::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_s32() {
-        let a: i32x2 = i32x2::new(-2147483648, -1);
-        let e: i32x2 = i32x2::new(0, 0);
-        let r: i32x2 = transmute(vclz_s32(transmute(a)));
+    unsafe fn test_vdup_laneq_u32() {
+        let a: u32x4 = u32x4::new(1, 1, 1, 4);
+        let e: u32x2 = u32x2::new(1, 1);
+        let r: u32x2 = transmute(vdup_laneq_u32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_s32() {
-        let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01);
-        let e: i32x4 = i32x4::new(0, 0, 32, 31);
-        let r: i32x4 = transmute(vclzq_s32(transmute(a)));
+    unsafe fn test_vdupq_lane_u8() {
+        let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u8x16 = transmute(vdupq_lane_u8::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_u8() {
-        let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01);
-        let e: u8x8 = u8x8::new(8, 8, 7, 7, 7, 7, 7, 7);
-        let r: u8x8 = transmute(vclz_u8(transmute(a)));
+    unsafe fn test_vdupq_lane_u16() {
+        let a: u16x4 = u16x4::new(1, 1, 1, 4);
+        let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u16x8 = transmute(vdupq_lane_u16::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_u8() {
-        let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xFF);
-        let e: u8x16 = u8x16::new(8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
-        let r: u8x16 = transmute(vclzq_u8(transmute(a)));
+    unsafe fn test_vdupq_lane_u32() {
+        let a: u32x2 = u32x2::new(1, 1);
+        let e: u32x4 = u32x4::new(1, 1, 1, 1);
+        let r: u32x4 = transmute(vdupq_lane_u32::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_u16() {
-        let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x01);
-        let e: u16x4 = u16x4::new(16, 16, 15, 15);
-        let r: u16x4 = transmute(vclz_u16(transmute(a)));
+    unsafe fn test_vdup_lane_p8() {
+        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x8 = transmute(vdup_lane_p8::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_u16() {
-        let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01);
-        let e: u16x8 = u16x8::new(16, 16, 15, 15, 15, 15, 15, 15);
-        let r: u16x8 = transmute(vclzq_u16(transmute(a)));
+    unsafe fn test_vdupq_laneq_p8() {
+        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x16 = transmute(vdupq_laneq_p8::<8>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_u32() {
-        let a: u32x2 = u32x2::new(0, 0x00);
-        let e: u32x2 = u32x2::new(32, 32);
-        let r: u32x2 = transmute(vclz_u32(transmute(a)));
+    unsafe fn test_vdup_lane_p16() {
+        let a: i16x4 = i16x4::new(1, 1, 1, 4);
+        let e: i16x4 = i16x4::new(1, 1, 1, 1);
+        let r: i16x4 = transmute(vdup_lane_p16::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_u32() {
-        let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x01);
-        let e: u32x4 = u32x4::new(32, 32, 31, 31);
-        let r: u32x4 = transmute(vclzq_u32(transmute(a)));
+    unsafe fn test_vdupq_laneq_p16() {
+        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i16x8 = transmute(vdupq_laneq_p16::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcagt_f32() {
-        let a: f32x2 = f32x2::new(-1.2, 0.0);
-        let b: f32x2 = f32x2::new(-1.1, 0.0);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
-        let r: u32x2 = transmute(vcagt_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_p8() {
+        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x8 = transmute(vdup_laneq_p8::<8>(transmute(a)));
         assert_eq!(r, e);
     }

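// vdup_lane_*::<N> broadcasts lane N of the source into every lane of the
// result (the `q` forms produce a 128-bit vector, the `laneq` forms read from
// a 128-bit source). A minimal sketch (assumes aarch64 with NEON; the name is
// illustrative):
#[cfg(target_arch = "aarch64")]
unsafe fn demo_vdup_lane() {
    use core::arch::aarch64::*;
    let a = vcreate_s32(0x0000_0004_0000_0001); // lanes [1, 4]
    let r = vdup_lane_s32::<1>(a); // broadcast lane 1 -> [4, 4]
    assert_eq!(vget_lane_s32::<0>(r), 4);
}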
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcagtq_f32() {
-        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
-        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
-        let r: u32x4 = transmute(vcagtq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_p16() {
+        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i16x4 = i16x4::new(1, 1, 1, 1);
+        let r: i16x4 = transmute(vdup_laneq_p16::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcage_f32() {
-        let a: f32x2 = f32x2::new(-1.2, 0.0);
-        let b: f32x2 = f32x2::new(-1.1, 0.0);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcage_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_p8() {
+        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x16 = transmute(vdupq_lane_p8::<4>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcageq_f32() {
-        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
-        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0);
-        let r: u32x4 = transmute(vcageq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_p16() {
+        let a: i16x4 = i16x4::new(1, 1, 1, 4);
+        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i16x8 = transmute(vdupq_lane_p16::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcalt_f32() {
-        let a: f32x2 = f32x2::new(-1.2, 0.0);
-        let b: f32x2 = f32x2::new(-1.1, 0.0);
-        let e: u32x2 = u32x2::new(0, 0);
-        let r: u32x2 = transmute(vcalt_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_s64() {
+        let a: i64x2 = i64x2::new(1, 1);
+        let e: i64x2 = i64x2::new(1, 1);
+        let r: i64x2 = transmute(vdupq_laneq_s64::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcaltq_f32() {
-        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
-        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
-        let e: u32x4 = u32x4::new(0, 0, 0, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcaltq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_s64() {
+        let a: i64x1 = i64x1::new(1);
+        let e: i64x2 = i64x2::new(1, 1);
+        let r: i64x2 = transmute(vdupq_lane_s64::<0>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcale_f32() {
-        let a: f32x2 = f32x2::new(-1.2, 0.0);
-        let b: f32x2 = f32x2::new(-1.1, 0.0);
-        let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcale_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_u64() {
+        let a: u64x2 = u64x2::new(1, 1);
+        let e: u64x2 = u64x2::new(1, 1);
+        let r: u64x2 = transmute(vdupq_laneq_u64::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcaleq_f32() {
-        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
-        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
-        let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcaleq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_u64() {
+        let a: u64x1 = u64x1::new(1);
+        let e: u64x2 = u64x2::new(1, 1);
+        let r: u64x2 = transmute(vdupq_lane_u64::<0>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_s8() {
-        let a: u64 = 1;
-        let e: i8x8 = i8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
-        let r: i8x8 = transmute(vcreate_s8(transmute(a)));
+    unsafe fn test_vdup_lane_f32() {
+        let a: f32x2 = f32x2::new(1., 1.);
+        let e: f32x2 = f32x2::new(1., 1.);
+        let r: f32x2 = transmute(vdup_lane_f32::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_s32() {
-        let a: u64 = 1;
-        let e: i32x2 = i32x2::new(1, 0);
-        let r: i32x2 = transmute(vcreate_s32(transmute(a)));
+    unsafe fn test_vdupq_laneq_f32() {
+        let a: f32x4 = f32x4::new(1., 1., 1., 4.);
+        let e: f32x4 = f32x4::new(1., 1., 1., 1.);
+        let r: f32x4 = transmute(vdupq_laneq_f32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_s64() {
-        let a: u64 = 1;
-        let e: i64x1 = i64x1::new(1);
-        let r: i64x1 = transmute(vcreate_s64(transmute(a)));
+    unsafe fn test_vdup_laneq_f32() {
+        let a: f32x4 = f32x4::new(1., 1., 1., 4.);
+        let e: f32x2 = f32x2::new(1., 1.);
+        let r: f32x2 = transmute(vdup_laneq_f32::<2>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_u8() {
-        let a: u64 = 1;
-        let e: u8x8 = u8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
-        let r: u8x8 = transmute(vcreate_u8(transmute(a)));
+    unsafe fn test_vdupq_lane_f32() {
+        let a: f32x2 = f32x2::new(1., 1.);
+        let e: f32x4 = f32x4::new(1., 1., 1., 1.);
+        let r: f32x4 = transmute(vdupq_lane_f32::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_u32() {
-        let a: u64 = 1;
-        let e: u32x2 = u32x2::new(1, 0);
-        let r: u32x2 = transmute(vcreate_u32(transmute(a)));
+    unsafe fn test_vdup_lane_s64() {
+        let a: i64x1 = i64x1::new(0);
+        let e: i64x1 = i64x1::new(0);
+        let r: i64x1 = transmute(vdup_lane_s64::<0>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_u64() {
-        let a: u64 = 1;
-        let e: u64x1 = u64x1::new(1);
-        let r: u64x1 = transmute(vcreate_u64(transmute(a)));
+    unsafe fn test_vdup_lane_u64() {
+        let a: u64x1 = u64x1::new(0);
+        let e: u64x1 = u64x1::new(0);
+        let r: u64x1 = transmute(vdup_lane_u64::<0>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_p8() {
-        let a: u64 = 1;
-        let e: i8x8 = i8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
-        let r: i8x8 = transmute(vcreate_p8(transmute(a)));
+    unsafe fn test_vdup_laneq_s64() {
+        let a: i64x2 = i64x2::new(0, 1);
+        let e: i64x1 = i64x1::new(1);
+        let r: i64x1 = transmute(vdup_laneq_s64::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_p16() {
-        let a: u64 = 1;
-        let e: i16x4 = i16x4::new(1, 0, 0, 0);
-        let r: i16x4 = transmute(vcreate_p16(transmute(a)));
+    unsafe fn test_vdup_laneq_u64() {
+        let a: u64x2 = u64x2::new(0, 1);
+        let e: u64x1 = u64x1::new(1);
+        let r: u64x1 = transmute(vdup_laneq_u64::<1>(transmute(a)));
         assert_eq!(r, e);
     }

     #[simd_test(enable = "neon")]
-    unsafe fn test_vcreate_p64() {
-        let a: u64 = 1;
-        let e: i64x1 = i64x1::new(1);
-        let r: i64x1 = transmute(vcreate_p64(transmute(a)));
+    unsafe fn test_vext_s8() {
+        let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+        let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+        let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+        let r: i8x8 = transmute(vext_s8::<4>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }

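// vext_*::<N> extracts a full-width window starting at lane N of the pair
// (a, b): the upper LEN-N lanes of `a` followed by the first N lanes of `b`,
// which is exactly the expected vector in test_vext_s8 above. A minimal
// sketch (assumes a little-endian aarch64 target with NEON; the name is
// illustrative):
#[cfg(target_arch = "aarch64")]
unsafe fn demo_vext() {
    use core::arch::aarch64::*;
    let a = vcreate_s8(0x0B09_0908_0908_0800); // lanes [0, 8, 8, 9, 8, 9, 9, 11]
    let b = vcreate_s8(0x1312_1110_0F0E_0B09); // lanes [9, 11, 14, 15, 16, 17, 18, 19]
    let r = vext_s8::<4>(a, b); // -> [8, 9, 9, 11, 9, 11, 14, 15]
    assert_eq!(vget_lane_s8::<0>(r), 8);
}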
i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); + let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11); + let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19); + let r: i8x16 = transmute(vextq_s8::<8>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvt_f32_s32() { - let a: i32x2 = i32x2::new(1, 2); - let e: f32x2 = f32x2::new(1., 2.); - let r: f32x2 = transmute(vcvt_f32_s32(transmute(a))); + unsafe fn test_vext_s16() { + let a: i16x4 = i16x4::new(0, 8, 8, 9); + let b: i16x4 = i16x4::new(9, 11, 14, 15); + let e: i16x4 = i16x4::new(8, 9, 9, 11); + let r: i16x4 = transmute(vext_s16::<2>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_f32_s32() { - let a: i32x4 = i32x4::new(1, 2, 3, 4); - let e: f32x4 = f32x4::new(1., 2., 3., 4.); - let r: f32x4 = transmute(vcvtq_f32_s32(transmute(a))); + unsafe fn test_vextq_s16() { + let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11); + let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19); + let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15); + let r: i16x8 = transmute(vextq_s16::<4>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvt_f32_u32() { - let a: u32x2 = u32x2::new(1, 2); - let e: f32x2 = f32x2::new(1., 2.); - let r: f32x2 = transmute(vcvt_f32_u32(transmute(a))); + unsafe fn test_vext_s32() { + let a: i32x2 = i32x2::new(0, 8); + let b: i32x2 = i32x2::new(9, 11); + let e: i32x2 = i32x2::new(8, 9); + let r: i32x2 = transmute(vext_s32::<1>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_f32_u32() { - let a: u32x4 = u32x4::new(1, 2, 3, 4); - let e: f32x4 = f32x4::new(1., 2., 3., 4.); - let r: f32x4 = transmute(vcvtq_f32_u32(transmute(a))); + unsafe fn test_vextq_s32() { + let a: i32x4 = i32x4::new(0, 8, 8, 9); + let b: i32x4 = i32x4::new(9, 11, 14, 15); + let e: i32x4 = i32x4::new(8, 9, 9, 11); + let r: i32x4 = transmute(vextq_s32::<2>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvt_n_f32_s32() { - let a: i32x2 = i32x2::new(1, 2); - let e: f32x2 = f32x2::new(0.25, 0.5); - let r: f32x2 = transmute(vcvt_n_f32_s32::<2>(transmute(a))); + unsafe fn test_vext_u8() { + let a: u8x8 = u8x8::new(0, 8, 8, 9, 8, 9, 9, 11); + let b: u8x8 = u8x8::new(9, 11, 14, 15, 16, 17, 18, 19); + let e: u8x8 = u8x8::new(8, 9, 9, 11, 9, 11, 14, 15); + let r: u8x8 = transmute(vext_u8::<4>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_n_f32_s32() { - let a: i32x4 = i32x4::new(1, 2, 3, 4); - let e: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.); - let r: f32x4 = transmute(vcvtq_n_f32_s32::<2>(transmute(a))); + unsafe fn test_vextq_u8() { + let a: u8x16 = u8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); + let b: u8x16 = u8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11); + let e: u8x16 = u8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19); + let r: u8x16 = transmute(vextq_u8::<8>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvt_n_f32_u32() { - let a: u32x2 = u32x2::new(1, 2); - let e: f32x2 = f32x2::new(0.25, 0.5); - let r: f32x2 = transmute(vcvt_n_f32_u32::<2>(transmute(a))); + unsafe fn test_vext_u16() { + let a: u16x4 = u16x4::new(0, 8, 8, 9); + let b: u16x4 = u16x4::new(9, 11, 14, 15); + 
let e: u16x4 = u16x4::new(8, 9, 9, 11); + let r: u16x4 = transmute(vext_u16::<2>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_n_f32_u32() { - let a: u32x4 = u32x4::new(1, 2, 3, 4); - let e: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.); - let r: f32x4 = transmute(vcvtq_n_f32_u32::<2>(transmute(a))); + unsafe fn test_vextq_u16() { + let a: u16x8 = u16x8::new(0, 8, 8, 9, 8, 9, 9, 11); + let b: u16x8 = u16x8::new(9, 11, 14, 15, 16, 17, 18, 19); + let e: u16x8 = u16x8::new(8, 9, 9, 11, 9, 11, 14, 15); + let r: u16x8 = transmute(vextq_u16::<4>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvt_n_s32_f32() { - let a: f32x2 = f32x2::new(0.25, 0.5); - let e: i32x2 = i32x2::new(1, 2); - let r: i32x2 = transmute(vcvt_n_s32_f32::<2>(transmute(a))); + unsafe fn test_vext_u32() { + let a: u32x2 = u32x2::new(0, 8); + let b: u32x2 = u32x2::new(9, 11); + let e: u32x2 = u32x2::new(8, 9); + let r: u32x2 = transmute(vext_u32::<1>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_n_s32_f32() { - let a: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.); - let e: i32x4 = i32x4::new(1, 2, 3, 4); - let r: i32x4 = transmute(vcvtq_n_s32_f32::<2>(transmute(a))); + unsafe fn test_vextq_u32() { + let a: u32x4 = u32x4::new(0, 8, 8, 9); + let b: u32x4 = u32x4::new(9, 11, 14, 15); + let e: u32x4 = u32x4::new(8, 9, 9, 11); + let r: u32x4 = transmute(vextq_u32::<2>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvt_n_u32_f32() { - let a: f32x2 = f32x2::new(0.25, 0.5); - let e: u32x2 = u32x2::new(1, 2); - let r: u32x2 = transmute(vcvt_n_u32_f32::<2>(transmute(a))); + unsafe fn test_vext_p8() { + let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11); + let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19); + let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15); + let r: i8x8 = transmute(vext_p8::<4>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_n_u32_f32() { - let a: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.); - let e: u32x4 = u32x4::new(1, 2, 3, 4); - let r: u32x4 = transmute(vcvtq_n_u32_f32::<2>(transmute(a))); + unsafe fn test_vextq_p8() { + let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); + let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11); + let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19); + let r: i8x16 = transmute(vextq_p8::<8>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvt_s32_f32() { - let a: f32x2 = f32x2::new(-1.1, 2.1); - let e: i32x2 = i32x2::new(-1, 2); - let r: i32x2 = transmute(vcvt_s32_f32(transmute(a))); + unsafe fn test_vext_p16() { + let a: i16x4 = i16x4::new(0, 8, 8, 9); + let b: i16x4 = i16x4::new(9, 11, 14, 15); + let e: i16x4 = i16x4::new(8, 9, 9, 11); + let r: i16x4 = transmute(vext_p16::<2>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_s32_f32() { - let a: f32x4 = f32x4::new(-1.1, 2.1, -2.9, 3.9); - let e: i32x4 = i32x4::new(-1, 2, -2, 3); - let r: i32x4 = transmute(vcvtq_s32_f32(transmute(a))); + unsafe fn test_vextq_p16() { + let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11); + let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19); + let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15); + let r: i16x8 = transmute(vextq_p16::<4>(transmute(a), 
transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvt_u32_f32() { - let a: f32x2 = f32x2::new(1.1, 2.1); - let e: u32x2 = u32x2::new(1, 2); - let r: u32x2 = transmute(vcvt_u32_f32(transmute(a))); + unsafe fn test_vextq_s64() { + let a: i64x2 = i64x2::new(0, 8); + let b: i64x2 = i64x2::new(9, 11); + let e: i64x2 = i64x2::new(8, 9); + let r: i64x2 = transmute(vextq_s64::<1>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_u32_f32() { - let a: f32x4 = f32x4::new(1.1, 2.1, 2.9, 3.9); - let e: u32x4 = u32x4::new(1, 2, 2, 3); - let r: u32x4 = transmute(vcvtq_u32_f32(transmute(a))); + unsafe fn test_vextq_u64() { + let a: u64x2 = u64x2::new(0, 8); + let b: u64x2 = u64x2::new(9, 11); + let e: u64x2 = u64x2::new(8, 9); + let r: u64x2 = transmute(vextq_u64::<1>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_s8() { - let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i8x8 = transmute(vdup_lane_s8::<4>(transmute(a))); + unsafe fn test_vext_f32() { + let a: f32x2 = f32x2::new(0., 2.); + let b: f32x2 = f32x2::new(3., 4.); + let e: f32x2 = f32x2::new(2., 3.); + let r: f32x2 = transmute(vext_f32::<1>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_s8() { - let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16); - let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r: i8x16 = transmute(vdupq_laneq_s8::<8>(transmute(a))); + unsafe fn test_vextq_f32() { + let a: f32x4 = f32x4::new(0., 2., 2., 3.); + let b: f32x4 = f32x4::new(3., 4., 5., 6.); + let e: f32x4 = f32x4::new(2., 3., 3., 4.); + let r: f32x4 = transmute(vextq_f32::<2>(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_s16() { - let a: i16x4 = i16x4::new(1, 1, 1, 4); - let e: i16x4 = i16x4::new(1, 1, 1, 1); - let r: i16x4 = transmute(vdup_lane_s16::<2>(transmute(a))); + unsafe fn test_vmla_s8() { + let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: i8x8 = transmute(vmla_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_s16() { - let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i16x8 = transmute(vdupq_laneq_s16::<4>(transmute(a))); + unsafe fn test_vmlaq_s8() { + let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let c: i8x16 = i8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); + let e: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); + let r: i8x16 = transmute(vmlaq_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_s32() { - let a: i32x2 = i32x2::new(1, 1); - let e: i32x2 = i32x2::new(1, 1); - let r: i32x2 = transmute(vdup_lane_s32::<1>(transmute(a))); + unsafe fn test_vmla_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(3, 3, 3, 3); + let e: i16x4 = i16x4::new(6, 7, 8, 9); + let r: i16x4 = 
transmute(vmla_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_s32() { - let a: i32x4 = i32x4::new(1, 1, 1, 4); - let e: i32x4 = i32x4::new(1, 1, 1, 1); - let r: i32x4 = transmute(vdupq_laneq_s32::<2>(transmute(a))); + unsafe fn test_vmlaq_s16() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: i16x8 = transmute(vmlaq_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_s8() { - let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16); - let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i8x8 = transmute(vdup_laneq_s8::<8>(transmute(a))); + unsafe fn test_vmla_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(3, 3); + let e: i32x2 = i32x2::new(6, 7); + let r: i32x2 = transmute(vmla_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_s16() { - let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: i16x4 = i16x4::new(1, 1, 1, 1); - let r: i16x4 = transmute(vdup_laneq_s16::<4>(transmute(a))); + unsafe fn test_vmlaq_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32x4 = i32x4::new(3, 3, 3, 3); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlaq_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_s32() { - let a: i32x4 = i32x4::new(1, 1, 1, 4); - let e: i32x2 = i32x2::new(1, 1); - let r: i32x2 = transmute(vdup_laneq_s32::<2>(transmute(a))); + unsafe fn test_vmla_u8() { + let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u8x8 = transmute(vmla_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_s8() { - let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r: i8x16 = transmute(vdupq_lane_s8::<4>(transmute(a))); + unsafe fn test_vmlaq_u8() { + let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let c: u8x16 = u8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); + let e: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); + let r: u8x16 = transmute(vmlaq_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_s16() { - let a: i16x4 = i16x4::new(1, 1, 1, 4); - let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i16x8 = transmute(vdupq_lane_s16::<2>(transmute(a))); + unsafe fn test_vmla_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(3, 3, 3, 3); + let e: u16x4 = u16x4::new(6, 7, 8, 9); + let r: u16x4 = transmute(vmla_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_s32() { - let a: i32x2 = 
i32x2::new(1, 1); - let e: i32x4 = i32x4::new(1, 1, 1, 1); - let r: i32x4 = transmute(vdupq_lane_s32::<1>(transmute(a))); + unsafe fn test_vmlaq_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16x8 = u16x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = transmute(vmlaq_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_u8() { - let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: u8x8 = transmute(vdup_lane_u8::<4>(transmute(a))); + unsafe fn test_vmla_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(3, 3); + let e: u32x2 = u32x2::new(6, 7); + let r: u32x2 = transmute(vmla_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_u8() { - let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16); - let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r: u8x16 = transmute(vdupq_laneq_u8::<8>(transmute(a))); + unsafe fn test_vmlaq_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32x4 = u32x4::new(3, 3, 3, 3); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlaq_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } - - #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_u16() { - let a: u16x4 = u16x4::new(1, 1, 1, 4); - let e: u16x4 = u16x4::new(1, 1, 1, 1); - let r: u16x4 = transmute(vdup_lane_u16::<2>(transmute(a))); + + #[simd_test(enable = "neon")] + unsafe fn test_vmla_f32() { + let a: f32x2 = f32x2::new(0., 1.); + let b: f32x2 = f32x2::new(2., 2.); + let c: f32x2 = f32x2::new(3., 3.); + let e: f32x2 = f32x2::new(6., 7.); + let r: f32x2 = transmute(vmla_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_u16() { - let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: u16x8 = transmute(vdupq_laneq_u16::<4>(transmute(a))); + unsafe fn test_vmlaq_f32() { + let a: f32x4 = f32x4::new(0., 1., 2., 3.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let c: f32x4 = f32x4::new(3., 3., 3., 3.); + let e: f32x4 = f32x4::new(6., 7., 8., 9.); + let r: f32x4 = transmute(vmlaq_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_u32() { - let a: u32x2 = u32x2::new(1, 1); - let e: u32x2 = u32x2::new(1, 1); - let r: u32x2 = transmute(vdup_lane_u32::<1>(transmute(a))); + unsafe fn test_vmla_n_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16 = 3; + let e: i16x4 = i16x4::new(6, 7, 8, 9); + let r: i16x4 = transmute(vmla_n_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_u32() { - let a: u32x4 = u32x4::new(1, 1, 1, 4); - let e: u32x4 = u32x4::new(1, 1, 1, 1); - let r: u32x4 = transmute(vdupq_laneq_u32::<2>(transmute(a))); + unsafe fn test_vmlaq_n_s16() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i16 = 3; + let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: i16x8 = 
transmute(vmlaq_n_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_u8() { - let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16); - let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: u8x8 = transmute(vdup_laneq_u8::<8>(transmute(a))); + unsafe fn test_vmla_n_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32 = 3; + let e: i32x2 = i32x2::new(6, 7); + let r: i32x2 = transmute(vmla_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_u16() { - let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: u16x4 = u16x4::new(1, 1, 1, 1); - let r: u16x4 = transmute(vdup_laneq_u16::<4>(transmute(a))); + unsafe fn test_vmlaq_n_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32 = 3; + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlaq_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_u32() { - let a: u32x4 = u32x4::new(1, 1, 1, 4); - let e: u32x2 = u32x2::new(1, 1); - let r: u32x2 = transmute(vdup_laneq_u32::<2>(transmute(a))); + unsafe fn test_vmla_n_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16 = 3; + let e: u16x4 = u16x4::new(6, 7, 8, 9); + let r: u16x4 = transmute(vmla_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_u8() { - let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r: u8x16 = transmute(vdupq_lane_u8::<4>(transmute(a))); + unsafe fn test_vmlaq_n_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16 = 3; + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = transmute(vmlaq_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_u16() { - let a: u16x4 = u16x4::new(1, 1, 1, 4); - let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: u16x8 = transmute(vdupq_lane_u16::<2>(transmute(a))); + unsafe fn test_vmla_n_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32 = 3; + let e: u32x2 = u32x2::new(6, 7); + let r: u32x2 = transmute(vmla_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_u32() { - let a: u32x2 = u32x2::new(1, 1); - let e: u32x4 = u32x4::new(1, 1, 1, 1); - let r: u32x4 = transmute(vdupq_lane_u32::<1>(transmute(a))); + unsafe fn test_vmlaq_n_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32 = 3; + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlaq_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_p8() { - let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i8x8 = transmute(vdup_lane_p8::<4>(transmute(a))); + unsafe fn test_vmla_n_f32() { + let a: f32x2 = f32x2::new(0., 1.); + let b: f32x2 = f32x2::new(2., 2.); + let c: f32 = 3.; + let e: f32x2 = f32x2::new(6., 7.); + let r: f32x2 = 
transmute(vmla_n_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_p8() { - let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16); - let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r: i8x16 = transmute(vdupq_laneq_p8::<8>(transmute(a))); + unsafe fn test_vmlaq_n_f32() { + let a: f32x4 = f32x4::new(0., 1., 2., 3.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let c: f32 = 3.; + let e: f32x4 = f32x4::new(6., 7., 8., 9.); + let r: f32x4 = transmute(vmlaq_n_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_p16() { - let a: i16x4 = i16x4::new(1, 1, 1, 4); - let e: i16x4 = i16x4::new(1, 1, 1, 1); - let r: i16x4 = transmute(vdup_lane_p16::<2>(transmute(a))); + unsafe fn test_vmla_lane_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(0, 3, 0, 0); + let e: i16x4 = i16x4::new(6, 7, 8, 9); + let r: i16x4 = transmute(vmla_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_p16() { - let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i16x8 = transmute(vdupq_laneq_p16::<4>(transmute(a))); + unsafe fn test_vmla_laneq_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: i16x4 = i16x4::new(6, 7, 8, 9); + let r: i16x4 = transmute(vmla_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_p8() { - let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16); - let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i8x8 = transmute(vdup_laneq_p8::<8>(transmute(a))); + unsafe fn test_vmlaq_lane_s16() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i16x4 = i16x4::new(0, 3, 0, 0); + let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: i16x8 = transmute(vmlaq_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_p16() { - let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: i16x4 = i16x4::new(1, 1, 1, 1); - let r: i16x4 = transmute(vdup_laneq_p16::<4>(transmute(a))); + unsafe fn test_vmlaq_laneq_s16() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: i16x8 = transmute(vmlaq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_p8() { - let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8); - let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r: i8x16 = transmute(vdupq_lane_p8::<4>(transmute(a))); + unsafe fn test_vmla_lane_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(0, 3); + let e: i32x2 = i32x2::new(6, 7); + let r: i32x2 = transmute(vmla_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_p16() { - let 
a: i16x4 = i16x4::new(1, 1, 1, 4); - let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i16x8 = transmute(vdupq_lane_p16::<2>(transmute(a))); + unsafe fn test_vmla_laneq_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x4 = i32x4::new(0, 3, 0, 0); + let e: i32x2 = i32x2::new(6, 7); + let r: i32x2 = transmute(vmla_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_s64() { - let a: i64x2 = i64x2::new(1, 1); - let e: i64x2 = i64x2::new(1, 1); - let r: i64x2 = transmute(vdupq_laneq_s64::<1>(transmute(a))); + unsafe fn test_vmlaq_lane_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32x2 = i32x2::new(0, 3); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlaq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_s64() { - let a: i64x1 = i64x1::new(1); - let e: i64x2 = i64x2::new(1, 1); - let r: i64x2 = transmute(vdupq_lane_s64::<0>(transmute(a))); + unsafe fn test_vmlaq_laneq_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32x4 = i32x4::new(0, 3, 0, 0); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlaq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_u64() { - let a: u64x2 = u64x2::new(1, 1); - let e: u64x2 = u64x2::new(1, 1); - let r: u64x2 = transmute(vdupq_laneq_u64::<1>(transmute(a))); + unsafe fn test_vmla_lane_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(0, 3, 0, 0); + let e: u16x4 = u16x4::new(6, 7, 8, 9); + let r: u16x4 = transmute(vmla_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_u64() { - let a: u64x1 = u64x1::new(1); - let e: u64x2 = u64x2::new(1, 1); - let r: u64x2 = transmute(vdupq_lane_u64::<0>(transmute(a))); + unsafe fn test_vmla_laneq_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: u16x4 = u16x4::new(6, 7, 8, 9); + let r: u16x4 = transmute(vmla_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_f32() { - let a: f32x2 = f32x2::new(1., 1.); - let e: f32x2 = f32x2::new(1., 1.); - let r: f32x2 = transmute(vdup_lane_f32::<1>(transmute(a))); + unsafe fn test_vmlaq_lane_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16x4 = u16x4::new(0, 3, 0, 0); + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = transmute(vmlaq_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_f32() { - let a: f32x4 = f32x4::new(1., 1., 1., 4.); - let e: f32x4 = f32x4::new(1., 1., 1., 1.); - let r: f32x4 = transmute(vdupq_laneq_f32::<2>(transmute(a))); + unsafe fn test_vmlaq_laneq_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = 
transmute(vmlaq_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_f32() { - let a: f32x4 = f32x4::new(1., 1., 1., 4.); - let e: f32x2 = f32x2::new(1., 1.); - let r: f32x2 = transmute(vdup_laneq_f32::<2>(transmute(a))); + unsafe fn test_vmla_lane_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(0, 3); + let e: u32x2 = u32x2::new(6, 7); + let r: u32x2 = transmute(vmla_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_f32() { - let a: f32x2 = f32x2::new(1., 1.); - let e: f32x4 = f32x4::new(1., 1., 1., 1.); - let r: f32x4 = transmute(vdupq_lane_f32::<1>(transmute(a))); + unsafe fn test_vmla_laneq_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u32x2 = u32x2::new(6, 7); + let r: u32x2 = transmute(vmla_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_s64() { - let a: i64x1 = i64x1::new(0); - let e: i64x1 = i64x1::new(0); - let r: i64x1 = transmute(vdup_lane_s64::<0>(transmute(a))); + unsafe fn test_vmlaq_lane_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32x2 = u32x2::new(0, 3); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlaq_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_u64() { - let a: u64x1 = u64x1::new(0); - let e: u64x1 = u64x1::new(0); - let r: u64x1 = transmute(vdup_lane_u64::<0>(transmute(a))); + unsafe fn test_vmlaq_laneq_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlaq_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_s64() { - let a: i64x2 = i64x2::new(0, 1); - let e: i64x1 = i64x1::new(1); - let r: i64x1 = transmute(vdup_laneq_s64::<1>(transmute(a))); + unsafe fn test_vmla_lane_f32() { + let a: f32x2 = f32x2::new(0., 1.); + let b: f32x2 = f32x2::new(2., 2.); + let c: f32x2 = f32x2::new(0., 3.); + let e: f32x2 = f32x2::new(6., 7.); + let r: f32x2 = transmute(vmla_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_u64() { - let a: u64x2 = u64x2::new(0, 1); - let e: u64x1 = u64x1::new(1); - let r: u64x1 = transmute(vdup_laneq_u64::<1>(transmute(a))); + unsafe fn test_vmla_laneq_f32() { + let a: f32x2 = f32x2::new(0., 1.); + let b: f32x2 = f32x2::new(2., 2.); + let c: f32x4 = f32x4::new(0., 3., 0., 0.); + let e: f32x2 = f32x2::new(6., 7.); + let r: f32x2 = transmute(vmla_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_s8() { - let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: i8x8 = transmute(vext_s8::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlaq_lane_f32() { + let a: f32x4 = f32x4::new(0., 1., 2., 3.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let c: f32x2 = f32x2::new(0., 3.); + let e: f32x4 
= f32x4::new(6., 7., 8., 9.); + let r: f32x4 = transmute(vmlaq_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_s8() { - let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); - let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11); - let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19); - let r: i8x16 = transmute(vextq_s8::<8>(transmute(a), transmute(b))); + unsafe fn test_vmlaq_laneq_f32() { + let a: f32x4 = f32x4::new(0., 1., 2., 3.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let c: f32x4 = f32x4::new(0., 3., 0., 0.); + let e: f32x4 = f32x4::new(6., 7., 8., 9.); + let r: f32x4 = transmute(vmlaq_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_s16() { - let a: i16x4 = i16x4::new(0, 8, 8, 9); - let b: i16x4 = i16x4::new(9, 11, 14, 15); - let e: i16x4 = i16x4::new(8, 9, 9, 11); - let r: i16x4 = transmute(vext_s16::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_s8() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: i16x8 = transmute(vmlal_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_s16() { - let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: i16x8 = transmute(vextq_s16::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(3, 3, 3, 3); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlal_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_s32() { - let a: i32x2 = i32x2::new(0, 8); - let b: i32x2 = i32x2::new(9, 11); - let e: i32x2 = i32x2::new(8, 9); - let r: i32x2 = transmute(vext_s32::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(3, 3); + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_s32() { - let a: i32x4 = i32x4::new(0, 8, 8, 9); - let b: i32x4 = i32x4::new(9, 11, 14, 15); - let e: i32x4 = i32x4::new(8, 9, 9, 11); - let r: i32x4 = transmute(vextq_s32::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_u8() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = transmute(vmlal_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_u8() { - let a: u8x8 = u8x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: u8x8 = u8x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: u8x8 = u8x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: u8x8 = transmute(vext_u8::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = 
u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(3, 3, 3, 3); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_u8() { - let a: u8x16 = u8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); - let b: u8x16 = u8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11); - let e: u8x16 = u8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19); - let r: u8x16 = transmute(vextq_u8::<8>(transmute(a), transmute(b))); + unsafe fn test_vmlal_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(3, 3); + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_u16() { - let a: u16x4 = u16x4::new(0, 8, 8, 9); - let b: u16x4 = u16x4::new(9, 11, 14, 15); - let e: u16x4 = u16x4::new(8, 9, 9, 11); - let r: u16x4 = transmute(vext_u16::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_n_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16 = 3; + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlal_n_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_u16() { - let a: u16x8 = u16x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: u16x8 = u16x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: u16x8 = u16x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: u16x8 = transmute(vextq_u16::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_n_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32 = 3; + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_u32() { - let a: u32x2 = u32x2::new(0, 8); - let b: u32x2 = u32x2::new(9, 11); - let e: u32x2 = u32x2::new(8, 9); - let r: u32x2 = transmute(vext_u32::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_n_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16 = 3; + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_u32() { - let a: u32x4 = u32x4::new(0, 8, 8, 9); - let b: u32x4 = u32x4::new(9, 11, 14, 15); - let e: u32x4 = u32x4::new(8, 9, 9, 11); - let r: u32x4 = transmute(vextq_u32::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_n_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32 = 3; + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_p8() { - let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: i8x8 = transmute(vext_p8::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_lane_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(0, 3, 0, 0); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlal_lane_s16::<1>(transmute(a), 
transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_p8() { - let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); - let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11); - let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19); - let r: i8x16 = transmute(vextq_p8::<8>(transmute(a), transmute(b))); + unsafe fn test_vmlal_laneq_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlal_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_p16() { - let a: i16x4 = i16x4::new(0, 8, 8, 9); - let b: i16x4 = i16x4::new(9, 11, 14, 15); - let e: i16x4 = i16x4::new(8, 9, 9, 11); - let r: i16x4 = transmute(vext_p16::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_lane_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(0, 3); + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_p16() { - let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: i16x8 = transmute(vextq_p16::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_laneq_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x4 = i32x4::new(0, 3, 0, 0); + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_s64() { - let a: i64x2 = i64x2::new(0, 8); - let b: i64x2 = i64x2::new(9, 11); - let e: i64x2 = i64x2::new(8, 9); - let r: i64x2 = transmute(vextq_s64::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_lane_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(0, 3, 0, 0); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_u64() { - let a: u64x2 = u64x2::new(0, 8); - let b: u64x2 = u64x2::new(9, 11); - let e: u64x2 = u64x2::new(8, 9); - let r: u64x2 = transmute(vextq_u64::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_laneq_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_f32() { - let a: f32x2 = f32x2::new(0., 2.); - let b: f32x2 = f32x2::new(3., 4.); - let e: f32x2 = f32x2::new(2., 3.); - let r: f32x2 = transmute(vext_f32::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_lane_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(0, 3); + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } 
#[simd_test(enable = "neon")] - unsafe fn test_vextq_f32() { - let a: f32x4 = f32x4::new(0., 2., 2., 3.); - let b: f32x4 = f32x4::new(3., 4., 5., 6.); - let e: f32x4 = f32x4::new(2., 3., 3., 4.); - let r: f32x4 = transmute(vextq_f32::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_laneq_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_s8() { - let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmls_s8() { + let a: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i8x8 = transmute(vmla_s8(transmute(a), transmute(b), transmute(c))); + let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i8x8 = transmute(vmls_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_s8() { - let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_vmlsq_s8() { + let a: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); let c: i8x16 = i8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); - let e: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); - let r: i8x16 = transmute(vmlaq_s8(transmute(a), transmute(b), transmute(c))); + let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r: i8x16 = transmute(vmlsq_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_s16() { - let a: i16x4 = i16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_s16() { + let a: i16x4 = i16x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16x4 = i16x4::new(3, 3, 3, 3); - let e: i16x4 = i16x4::new(6, 7, 8, 9); - let r: i16x4 = transmute(vmla_s16(transmute(a), transmute(b), transmute(c))); + let e: i16x4 = i16x4::new(0, 1, 2, 3); + let r: i16x4 = transmute(vmls_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_s16() { - let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_s16() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i16x8 = transmute(vmlaq_s16(transmute(a), transmute(b), transmute(c))); + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsq_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_s32() { - let a: i32x2 = i32x2::new(0, 1); + unsafe fn test_vmls_s32() { + let a: i32x2 = i32x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32x2 = i32x2::new(3, 3); - let e: i32x2 = i32x2::new(6, 7); - let r: i32x2 = transmute(vmla_s32(transmute(a), transmute(b), transmute(c))); + let e: i32x2 = i32x2::new(0, 1); + let r: i32x2 = transmute(vmls_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_s32() { - let a: 
i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_s32() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i32x4 = i32x4::new(2, 2, 2, 2); let c: i32x4 = i32x4::new(3, 3, 3, 3); - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlaq_s32(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsq_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_u8() { - let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmls_u8() { + let a: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u8x8 = transmute(vmla_u8(transmute(a), transmute(b), transmute(c))); + let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u8x8 = transmute(vmls_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_u8() { - let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_vmlsq_u8() { + let a: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); let c: u8x16 = u8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); - let e: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); - let r: u8x16 = transmute(vmlaq_u8(transmute(a), transmute(b), transmute(c))); + let e: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r: u8x16 = transmute(vmlsq_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_u16() { - let a: u16x4 = u16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_u16() { + let a: u16x4 = u16x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16x4 = u16x4::new(3, 3, 3, 3); - let e: u16x4 = u16x4::new(6, 7, 8, 9); - let r: u16x4 = transmute(vmla_u16(transmute(a), transmute(b), transmute(c))); + let e: u16x4 = u16x4::new(0, 1, 2, 3); + let r: u16x4 = transmute(vmls_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_u16() { - let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_u16() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: u16x8 = u16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u16x8 = transmute(vmlaq_u16(transmute(a), transmute(b), transmute(c))); + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsq_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_u32() { - let a: u32x2 = u32x2::new(0, 1); + unsafe fn test_vmls_u32() { + let a: u32x2 = u32x2::new(6, 7); let b: u32x2 = u32x2::new(2, 2); let c: u32x2 = u32x2::new(3, 3); - let e: u32x2 = u32x2::new(6, 7); - let r: u32x2 = transmute(vmla_u32(transmute(a), transmute(b), transmute(c))); + let e: u32x2 = u32x2::new(0, 1); + let r: u32x2 = transmute(vmls_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_u32() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_u32() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u32x4 
= u32x4::new(2, 2, 2, 2); let c: u32x4 = u32x4::new(3, 3, 3, 3); - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlaq_u32(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsq_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_f32() { - let a: f32x2 = f32x2::new(0., 1.); + unsafe fn test_vmls_f32() { + let a: f32x2 = f32x2::new(6., 7.); let b: f32x2 = f32x2::new(2., 2.); let c: f32x2 = f32x2::new(3., 3.); - let e: f32x2 = f32x2::new(6., 7.); - let r: f32x2 = transmute(vmla_f32(transmute(a), transmute(b), transmute(c))); + let e: f32x2 = f32x2::new(0., 1.); + let r: f32x2 = transmute(vmls_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_f32() { - let a: f32x4 = f32x4::new(0., 1., 2., 3.); + unsafe fn test_vmlsq_f32() { + let a: f32x4 = f32x4::new(6., 7., 8., 9.); let b: f32x4 = f32x4::new(2., 2., 2., 2.); let c: f32x4 = f32x4::new(3., 3., 3., 3.); - let e: f32x4 = f32x4::new(6., 7., 8., 9.); - let r: f32x4 = transmute(vmlaq_f32(transmute(a), transmute(b), transmute(c))); + let e: f32x4 = f32x4::new(0., 1., 2., 3.); + let r: f32x4 = transmute(vmlsq_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_n_s16() { - let a: i16x4 = i16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_n_s16() { + let a: i16x4 = i16x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16 = 3; - let e: i16x4 = i16x4::new(6, 7, 8, 9); - let r: i16x4 = transmute(vmla_n_s16(transmute(a), transmute(b), transmute(c))); + let e: i16x4 = i16x4::new(0, 1, 2, 3); + let r: i16x4 = transmute(vmls_n_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_n_s16() { - let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_n_s16() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: i16 = 3; - let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i16x8 = transmute(vmlaq_n_s16(transmute(a), transmute(b), transmute(c))); + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsq_n_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_n_s32() { - let a: i32x2 = i32x2::new(0, 1); + unsafe fn test_vmls_n_s32() { + let a: i32x2 = i32x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32 = 3; - let e: i32x2 = i32x2::new(6, 7); - let r: i32x2 = transmute(vmla_n_s32(transmute(a), transmute(b), transmute(c))); + let e: i32x2 = i32x2::new(0, 1); + let r: i32x2 = transmute(vmls_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_n_s32() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_n_s32() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i32x4 = i32x4::new(2, 2, 2, 2); let c: i32 = 3; - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlaq_n_s32(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsq_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_n_u16() { - let a: u16x4 = u16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_n_u16() { + let a: u16x4 = 
u16x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16 = 3; - let e: u16x4 = u16x4::new(6, 7, 8, 9); - let r: u16x4 = transmute(vmla_n_u16(transmute(a), transmute(b), transmute(c))); + let e: u16x4 = u16x4::new(0, 1, 2, 3); + let r: u16x4 = transmute(vmls_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_n_u16() { - let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_n_u16() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: u16 = 3; - let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u16x8 = transmute(vmlaq_n_u16(transmute(a), transmute(b), transmute(c))); + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsq_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_n_u32() { - let a: u32x2 = u32x2::new(0, 1); + unsafe fn test_vmls_n_u32() { + let a: u32x2 = u32x2::new(6, 7); let b: u32x2 = u32x2::new(2, 2); let c: u32 = 3; - let e: u32x2 = u32x2::new(6, 7); - let r: u32x2 = transmute(vmla_n_u32(transmute(a), transmute(b), transmute(c))); + let e: u32x2 = u32x2::new(0, 1); + let r: u32x2 = transmute(vmls_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_n_u32() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_n_u32() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u32x4 = u32x4::new(2, 2, 2, 2); let c: u32 = 3; - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlaq_n_u32(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsq_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_n_f32() { - let a: f32x2 = f32x2::new(0., 1.); + unsafe fn test_vmls_n_f32() { + let a: f32x2 = f32x2::new(6., 7.); let b: f32x2 = f32x2::new(2., 2.); let c: f32 = 3.; - let e: f32x2 = f32x2::new(6., 7.); - let r: f32x2 = transmute(vmla_n_f32(transmute(a), transmute(b), transmute(c))); + let e: f32x2 = f32x2::new(0., 1.); + let r: f32x2 = transmute(vmls_n_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_n_f32() { - let a: f32x4 = f32x4::new(0., 1., 2., 3.); + unsafe fn test_vmlsq_n_f32() { + let a: f32x4 = f32x4::new(6., 7., 8., 9.); let b: f32x4 = f32x4::new(2., 2., 2., 2.); let c: f32 = 3.; - let e: f32x4 = f32x4::new(6., 7., 8., 9.); - let r: f32x4 = transmute(vmlaq_n_f32(transmute(a), transmute(b), transmute(c))); + let e: f32x4 = f32x4::new(0., 1., 2., 3.); + let r: f32x4 = transmute(vmlsq_n_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_lane_s16() { - let a: i16x4 = i16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_lane_s16() { + let a: i16x4 = i16x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16x4 = i16x4::new(0, 3, 0, 0); - let e: i16x4 = i16x4::new(6, 7, 8, 9); - let r: i16x4 = transmute(vmla_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + let e: i16x4 = i16x4::new(0, 1, 2, 3); + let r: i16x4 = transmute(vmls_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_laneq_s16() { - let a: i16x4 = i16x4::new(0, 1, 2, 3); + unsafe fn 
test_vmls_laneq_s16() { + let a: i16x4 = i16x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: i16x4 = i16x4::new(6, 7, 8, 9); - let r: i16x4 = transmute(vmla_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + let e: i16x4 = i16x4::new(0, 1, 2, 3); + let r: i16x4 = transmute(vmls_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_lane_s16() { - let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_lane_s16() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: i16x4 = i16x4::new(0, 3, 0, 0); - let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i16x8 = transmute(vmlaq_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsq_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_laneq_s16() { - let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_laneq_s16() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i16x8 = transmute(vmlaq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_lane_s32() { - let a: i32x2 = i32x2::new(0, 1); + unsafe fn test_vmls_lane_s32() { + let a: i32x2 = i32x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32x2 = i32x2::new(0, 3); - let e: i32x2 = i32x2::new(6, 7); - let r: i32x2 = transmute(vmla_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + let e: i32x2 = i32x2::new(0, 1); + let r: i32x2 = transmute(vmls_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_laneq_s32() { - let a: i32x2 = i32x2::new(0, 1); + unsafe fn test_vmls_laneq_s32() { + let a: i32x2 = i32x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32x4 = i32x4::new(0, 3, 0, 0); - let e: i32x2 = i32x2::new(6, 7); - let r: i32x2 = transmute(vmla_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + let e: i32x2 = i32x2::new(0, 1); + let r: i32x2 = transmute(vmls_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_lane_s32() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_lane_s32() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i32x4 = i32x4::new(2, 2, 2, 2); let c: i32x2 = i32x2::new(0, 3); - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlaq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_laneq_s32() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_laneq_s32() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i32x4 = i32x4::new(2, 2, 2, 2); let c: i32x4 = i32x4::new(0, 3, 0, 0); - let e: i32x4 = i32x4::new(6, 7, 8, 9); - 
let r: i32x4 = transmute(vmlaq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_lane_u16() { - let a: u16x4 = u16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_lane_u16() { + let a: u16x4 = u16x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16x4 = u16x4::new(0, 3, 0, 0); - let e: u16x4 = u16x4::new(6, 7, 8, 9); - let r: u16x4 = transmute(vmla_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); + let e: u16x4 = u16x4::new(0, 1, 2, 3); + let r: u16x4 = transmute(vmls_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_laneq_u16() { - let a: u16x4 = u16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_laneq_u16() { + let a: u16x4 = u16x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: u16x4 = u16x4::new(6, 7, 8, 9); - let r: u16x4 = transmute(vmla_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + let e: u16x4 = u16x4::new(0, 1, 2, 3); + let r: u16x4 = transmute(vmls_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_lane_u16() { - let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_lane_u16() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: u16x4 = u16x4::new(0, 3, 0, 0); - let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u16x8 = transmute(vmlaq_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsq_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_laneq_u16() { - let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_laneq_u16() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u16x8 = transmute(vmlaq_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsq_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_lane_u32() { - let a: u32x2 = u32x2::new(0, 1); + unsafe fn test_vmls_lane_u32() { + let a: u32x2 = u32x2::new(6, 7); let b: u32x2 = u32x2::new(2, 2); let c: u32x2 = u32x2::new(0, 3); - let e: u32x2 = u32x2::new(6, 7); - let r: u32x2 = transmute(vmla_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); + let e: u32x2 = u32x2::new(0, 1); + let r: u32x2 = transmute(vmls_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_laneq_u32() { - let a: u32x2 = u32x2::new(0, 1); + unsafe fn test_vmls_laneq_u32() { + let a: u32x2 = u32x2::new(6, 7); let b: u32x2 = u32x2::new(2, 2); let c: u32x4 = u32x4::new(0, 3, 0, 0); - let e: u32x2 = u32x2::new(6, 7); - let r: u32x2 = transmute(vmla_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + let e: u32x2 = u32x2::new(0, 1); + let r: u32x2 = 
transmute(vmls_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_lane_u32() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_lane_u32() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u32x4 = u32x4::new(2, 2, 2, 2); let c: u32x2 = u32x2::new(0, 3); - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlaq_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsq_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_laneq_u32() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_laneq_u32() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u32x4 = u32x4::new(2, 2, 2, 2); let c: u32x4 = u32x4::new(0, 3, 0, 0); - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlaq_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsq_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_lane_f32() { - let a: f32x2 = f32x2::new(0., 1.); + unsafe fn test_vmls_lane_f32() { + let a: f32x2 = f32x2::new(6., 7.); let b: f32x2 = f32x2::new(2., 2.); let c: f32x2 = f32x2::new(0., 3.); - let e: f32x2 = f32x2::new(6., 7.); - let r: f32x2 = transmute(vmla_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); + let e: f32x2 = f32x2::new(0., 1.); + let r: f32x2 = transmute(vmls_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_laneq_f32() { - let a: f32x2 = f32x2::new(0., 1.); + unsafe fn test_vmls_laneq_f32() { + let a: f32x2 = f32x2::new(6., 7.); let b: f32x2 = f32x2::new(2., 2.); let c: f32x4 = f32x4::new(0., 3., 0., 0.); - let e: f32x2 = f32x2::new(6., 7.); - let r: f32x2 = transmute(vmla_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); + let e: f32x2 = f32x2::new(0., 1.); + let r: f32x2 = transmute(vmls_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_lane_f32() { - let a: f32x4 = f32x4::new(0., 1., 2., 3.); + unsafe fn test_vmlsq_lane_f32() { + let a: f32x4 = f32x4::new(6., 7., 8., 9.); let b: f32x4 = f32x4::new(2., 2., 2., 2.); let c: f32x2 = f32x2::new(0., 3.); - let e: f32x4 = f32x4::new(6., 7., 8., 9.); - let r: f32x4 = transmute(vmlaq_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); + let e: f32x4 = f32x4::new(0., 1., 2., 3.); + let r: f32x4 = transmute(vmlsq_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_laneq_f32() { - let a: f32x4 = f32x4::new(0., 1., 2., 3.); + unsafe fn test_vmlsq_laneq_f32() { + let a: f32x4 = f32x4::new(6., 7., 8., 9.); let b: f32x4 = f32x4::new(2., 2., 2., 2.); let c: f32x4 = f32x4::new(0., 3., 0., 0.); - let e: f32x4 = f32x4::new(6., 7., 8., 9.); - let r: f32x4 = transmute(vmlaq_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); + let e: f32x4 = f32x4::new(0., 1., 2., 3.); + let r: f32x4 = transmute(vmlsq_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_s8() { - let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsl_s8() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 
11, 12, 13); let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i16x8 = transmute(vmlal_s8(transmute(a), transmute(b), transmute(c))); + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsl_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_s16() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsl_s16() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16x4 = i16x4::new(3, 3, 3, 3); - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlal_s16(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsl_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_s32() { - let a: i64x2 = i64x2::new(0, 1); + unsafe fn test_vmlsl_s32() { + let a: i64x2 = i64x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32x2 = i32x2::new(3, 3); - let e: i64x2 = i64x2::new(6, 7); - let r: i64x2 = transmute(vmlal_s32(transmute(a), transmute(b), transmute(c))); + let e: i64x2 = i64x2::new(0, 1); + let r: i64x2 = transmute(vmlsl_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_u8() { - let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsl_u8() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u16x8 = transmute(vmlal_u8(transmute(a), transmute(b), transmute(c))); + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsl_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_u16() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsl_u16() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16x4 = u16x4::new(3, 3, 3, 3); - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlal_u16(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsl_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_u32() { - let a: u64x2 = u64x2::new(0, 1); + unsafe fn test_vmlsl_u32() { + let a: u64x2 = u64x2::new(6, 7); let b: u32x2 = u32x2::new(2, 2); let c: u32x2 = u32x2::new(3, 3); - let e: u64x2 = u64x2::new(6, 7); - let r: u64x2 = transmute(vmlal_u32(transmute(a), transmute(b), transmute(c))); + let e: u64x2 = u64x2::new(0, 1); + let r: u64x2 = transmute(vmlsl_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_n_s16() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsl_n_s16() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16 = 3; - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlal_n_s16(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsl_n_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_n_s32() { - let a: 
i64x2 = i64x2::new(0, 1); + unsafe fn test_vmlsl_n_s32() { + let a: i64x2 = i64x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32 = 3; - let e: i64x2 = i64x2::new(6, 7); - let r: i64x2 = transmute(vmlal_n_s32(transmute(a), transmute(b), transmute(c))); + let e: i64x2 = i64x2::new(0, 1); + let r: i64x2 = transmute(vmlsl_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_n_u16() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsl_n_u16() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16 = 3; - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlal_n_u16(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsl_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_n_u32() { - let a: u64x2 = u64x2::new(0, 1); + unsafe fn test_vmlsl_n_u32() { + let a: u64x2 = u64x2::new(6, 7); let b: u32x2 = u32x2::new(2, 2); let c: u32 = 3; - let e: u64x2 = u64x2::new(6, 7); - let r: u64x2 = transmute(vmlal_n_u32(transmute(a), transmute(b), transmute(c))); + let e: u64x2 = u64x2::new(0, 1); + let r: u64x2 = transmute(vmlsl_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_lane_s16() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsl_lane_s16() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16x4 = i16x4::new(0, 3, 0, 0); - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlal_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsl_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_laneq_s16() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsl_laneq_s16() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlal_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsl_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_lane_s32() { - let a: i64x2 = i64x2::new(0, 1); + unsafe fn test_vmlsl_lane_s32() { + let a: i64x2 = i64x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32x2 = i32x2::new(0, 3); - let e: i64x2 = i64x2::new(6, 7); - let r: i64x2 = transmute(vmlal_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + let e: i64x2 = i64x2::new(0, 1); + let r: i64x2 = transmute(vmlsl_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_laneq_s32() { - let a: i64x2 = i64x2::new(0, 1); + unsafe fn test_vmlsl_laneq_s32() { + let a: i64x2 = i64x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32x4 = i32x4::new(0, 3, 0, 0); - let e: i64x2 = i64x2::new(6, 7); - let r: i64x2 = transmute(vmlal_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + let e: i64x2 = i64x2::new(0, 1); + let r: i64x2 = transmute(vmlsl_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn 
test_vmlal_lane_u16() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsl_lane_u16() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16x4 = u16x4::new(0, 3, 0, 0); - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlal_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsl_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_laneq_u16() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsl_laneq_u16() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlal_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsl_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_lane_u32() { + let a: u64x2 = u64x2::new(6, 7); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(0, 3); + let e: u64x2 = u64x2::new(0, 1); + let r: u64x2 = transmute(vmlsl_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_laneq_u32() { + let a: u64x2 = u64x2::new(6, 7); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u64x2 = u64x2::new(0, 1); + let r: u64x2 = transmute(vmlsl_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vneg_s8() { + let a: i8x8 = i8x8::new(0, 1, -1, 2, -2, 3, -3, 4); + let e: i8x8 = i8x8::new(0, -1, 1, -2, 2, -3, 3, -4); + let r: i8x8 = transmute(vneg_s8(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_lane_u32() { - let a: u64x2 = u64x2::new(0, 1); - let b: u32x2 = u32x2::new(2, 2); - let c: u32x2 = u32x2::new(0, 3); - let e: u64x2 = u64x2::new(6, 7); - let r: u64x2 = transmute(vmlal_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vnegq_s8() { + let a: i8x16 = i8x16::new(0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8); + let e: i8x16 = i8x16::new(0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8); + let r: i8x16 = transmute(vnegq_s8(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_laneq_u32() { - let a: u64x2 = u64x2::new(0, 1); - let b: u32x2 = u32x2::new(2, 2); - let c: u32x4 = u32x4::new(0, 3, 0, 0); - let e: u64x2 = u64x2::new(6, 7); - let r: u64x2 = transmute(vmlal_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vneg_s16() { + let a: i16x4 = i16x4::new(0, 1, -1, 2); + let e: i16x4 = i16x4::new(0, -1, 1, -2); + let r: i16x4 = transmute(vneg_s16(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_s8() { - let a: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: i8x8 = transmute(vmls_s8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vnegq_s16() { + let a: i16x8 = i16x8::new(0, 1, -1, 2, -2, 3, -3, 4); + let e: i16x8 = i16x8::new(0, -1, 1, -2, 2, -3, 3, -4); + let r: i16x8 = 
transmute(vnegq_s16(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_s8() { - let a: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); - let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let c: i8x16 = i8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); - let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r: i8x16 = transmute(vmlsq_s8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vneg_s32() { + let a: i32x2 = i32x2::new(0, 1); + let e: i32x2 = i32x2::new(0, -1); + let r: i32x2 = transmute(vneg_s32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_s16() { - let a: i16x4 = i16x4::new(6, 7, 8, 9); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16x4 = i16x4::new(3, 3, 3, 3); - let e: i16x4 = i16x4::new(0, 1, 2, 3); - let r: i16x4 = transmute(vmls_s16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vnegq_s32() { + let a: i32x4 = i32x4::new(0, 1, -1, 2); + let e: i32x4 = i32x4::new(0, -1, 1, -2); + let r: i32x4 = transmute(vnegq_s32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_s16() { - let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: i16x8 = transmute(vmlsq_s16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vneg_f32() { + let a: f32x2 = f32x2::new(0., 1.); + let e: f32x2 = f32x2::new(0., -1.); + let r: f32x2 = transmute(vneg_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_s32() { - let a: i32x2 = i32x2::new(6, 7); - let b: i32x2 = i32x2::new(2, 2); - let c: i32x2 = i32x2::new(3, 3); - let e: i32x2 = i32x2::new(0, 1); - let r: i32x2 = transmute(vmls_s32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vnegq_f32() { + let a: f32x4 = f32x4::new(0., 1., -1., 2.); + let e: f32x4 = f32x4::new(0., -1., 1., -2.); + let r: f32x4 = transmute(vnegq_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_s32() { - let a: i32x4 = i32x4::new(6, 7, 8, 9); - let b: i32x4 = i32x4::new(2, 2, 2, 2); - let c: i32x4 = i32x4::new(3, 3, 3, 3); - let e: i32x4 = i32x4::new(0, 1, 2, 3); - let r: i32x4 = transmute(vmlsq_s32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqneg_s8() { + let a: i8x8 = i8x8::new(-128, 0, 1, -1, 2, -2, 3, -3); + let e: i8x8 = i8x8::new(0x7F, 0, -1, 1, -2, 2, -3, 3); + let r: i8x8 = transmute(vqneg_s8(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_u8() { - let a: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: u8x8 = transmute(vmls_u8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqnegq_s8() { + let a: i8x16 = i8x16::new(-128, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7); + let e: i8x16 = i8x16::new(0x7F, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + let r: i8x16 = transmute(vqnegq_s8(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_u8() { - let a: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); - let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2); - let c: u8x16 = u8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); - let e: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r: u8x16 = transmute(vmlsq_u8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqneg_s16() { + let a: i16x4 = i16x4::new(-32768, 0, 1, -1); + let e: i16x4 = i16x4::new(0x7F_FF, 0, -1, 1); + let r: i16x4 = transmute(vqneg_s16(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_u16() { - let a: u16x4 = u16x4::new(6, 7, 8, 9); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16x4 = u16x4::new(3, 3, 3, 3); - let e: u16x4 = u16x4::new(0, 1, 2, 3); - let r: u16x4 = transmute(vmls_u16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqnegq_s16() { + let a: i16x8 = i16x8::new(-32768, 0, 1, -1, 2, -2, 3, -3); + let e: i16x8 = i16x8::new(0x7F_FF, 0, -1, 1, -2, 2, -3, 3); + let r: i16x8 = transmute(vqnegq_s16(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_u16() { - let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u16x8 = u16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: u16x8 = transmute(vmlsq_u16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqneg_s32() { + let a: i32x2 = i32x2::new(-2147483648, 0); + let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0); + let r: i32x2 = transmute(vqneg_s32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_u32() { - let a: u32x2 = u32x2::new(6, 7); - let b: u32x2 = u32x2::new(2, 2); - let c: u32x2 = u32x2::new(3, 3); - let e: u32x2 = u32x2::new(0, 1); - let r: u32x2 = transmute(vmls_u32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqnegq_s32() { + let a: i32x4 = i32x4::new(-2147483648, 0, 1, -1); + let e: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0, -1, 1); + let r: i32x4 = transmute(vqnegq_s32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_u32() { - let a: u32x4 = u32x4::new(6, 7, 8, 9); - let b: u32x4 = u32x4::new(2, 2, 2, 2); - let c: u32x4 = u32x4::new(3, 3, 3, 3); - let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsq_u32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsub_u8() { + let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(41, 40, 39, 38, 37, 36, 35, 34); + let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_f32() { - let a: f32x2 = f32x2::new(6., 7.); - let b: f32x2 = f32x2::new(2., 2.); - let c: f32x2 = f32x2::new(3., 3.); - let e: f32x2 = f32x2::new(0., 1.); - let r: f32x2 = transmute(vmls_f32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsubq_u8() { + let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26); + let r: u8x16 = transmute(vqsubq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_f32() { - let a: f32x4 = f32x4::new(6., 7., 8., 9.); - let b: f32x4 = f32x4::new(2., 2., 2., 2.); - let c: f32x4 = f32x4::new(3., 3., 3., 3.); - let e: f32x4 = f32x4::new(0., 1., 2., 3.); - let r: 
f32x4 = transmute(vmlsq_f32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsub_u16() { + let a: u16x4 = u16x4::new(42, 42, 42, 42); + let b: u16x4 = u16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(41, 40, 39, 38); + let r: u16x4 = transmute(vqsub_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_n_s16() { - let a: i16x4 = i16x4::new(6, 7, 8, 9); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16 = 3; - let e: i16x4 = i16x4::new(0, 1, 2, 3); - let r: i16x4 = transmute(vmls_n_s16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsubq_u16() { + let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(41, 40, 39, 38, 37, 36, 35, 34); + let r: u16x8 = transmute(vqsubq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_n_s16() { - let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i16 = 3; - let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: i16x8 = transmute(vmlsq_n_s16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsub_u32() { + let a: u32x2 = u32x2::new(42, 42); + let b: u32x2 = u32x2::new(1, 2); + let e: u32x2 = u32x2::new(41, 40); + let r: u32x2 = transmute(vqsub_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_n_s32() { - let a: i32x2 = i32x2::new(6, 7); - let b: i32x2 = i32x2::new(2, 2); - let c: i32 = 3; - let e: i32x2 = i32x2::new(0, 1); - let r: i32x2 = transmute(vmls_n_s32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsubq_u32() { + let a: u32x4 = u32x4::new(42, 42, 42, 42); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(41, 40, 39, 38); + let r: u32x4 = transmute(vqsubq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_n_s32() { - let a: i32x4 = i32x4::new(6, 7, 8, 9); - let b: i32x4 = i32x4::new(2, 2, 2, 2); - let c: i32 = 3; - let e: i32x4 = i32x4::new(0, 1, 2, 3); - let r: i32x4 = transmute(vmlsq_n_s32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsub_u64() { + let a: u64x1 = u64x1::new(42); + let b: u64x1 = u64x1::new(1); + let e: u64x1 = u64x1::new(41); + let r: u64x1 = transmute(vqsub_u64(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_n_u16() { - let a: u16x4 = u16x4::new(6, 7, 8, 9); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16 = 3; - let e: u16x4 = u16x4::new(0, 1, 2, 3); - let r: u16x4 = transmute(vmls_n_u16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsubq_u64() { + let a: u64x2 = u64x2::new(42, 42); + let b: u64x2 = u64x2::new(1, 2); + let e: u64x2 = u64x2::new(41, 40); + let r: u64x2 = transmute(vqsubq_u64(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_n_u16() { - let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u16 = 3; - let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: u16x8 = transmute(vmlsq_n_u16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsub_s8() { + let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i8x8 = i8x8::new(41, 40, 39, 38, 37, 36, 35, 34); + let r: i8x8 = 
transmute(vqsub_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_n_u32() { - let a: u32x2 = u32x2::new(6, 7); - let b: u32x2 = u32x2::new(2, 2); - let c: u32 = 3; - let e: u32x2 = u32x2::new(0, 1); - let r: u32x2 = transmute(vmls_n_u32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsubq_s8() { + let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: i8x16 = i8x16::new(41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26); + let r: i8x16 = transmute(vqsubq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_n_u32() { - let a: u32x4 = u32x4::new(6, 7, 8, 9); - let b: u32x4 = u32x4::new(2, 2, 2, 2); - let c: u32 = 3; - let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsq_n_u32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsub_s16() { + let a: i16x4 = i16x4::new(42, 42, 42, 42); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let e: i16x4 = i16x4::new(41, 40, 39, 38); + let r: i16x4 = transmute(vqsub_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsubq_s16() { + let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i16x8 = i16x8::new(41, 40, 39, 38, 37, 36, 35, 34); + let r: i16x8 = transmute(vqsubq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_n_f32() { - let a: f32x2 = f32x2::new(6., 7.); - let b: f32x2 = f32x2::new(2., 2.); - let c: f32 = 3.; - let e: f32x2 = f32x2::new(0., 1.); - let r: f32x2 = transmute(vmls_n_f32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsub_s32() { + let a: i32x2 = i32x2::new(42, 42); + let b: i32x2 = i32x2::new(1, 2); + let e: i32x2 = i32x2::new(41, 40); + let r: i32x2 = transmute(vqsub_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_n_f32() { - let a: f32x4 = f32x4::new(6., 7., 8., 9.); - let b: f32x4 = f32x4::new(2., 2., 2., 2.); - let c: f32 = 3.; - let e: f32x4 = f32x4::new(0., 1., 2., 3.); - let r: f32x4 = transmute(vmlsq_n_f32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsubq_s32() { + let a: i32x4 = i32x4::new(42, 42, 42, 42); + let b: i32x4 = i32x4::new(1, 2, 3, 4); + let e: i32x4 = i32x4::new(41, 40, 39, 38); + let r: i32x4 = transmute(vqsubq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_lane_s16() { - let a: i16x4 = i16x4::new(6, 7, 8, 9); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16x4 = i16x4::new(0, 3, 0, 0); - let e: i16x4 = i16x4::new(0, 1, 2, 3); - let r: i16x4 = transmute(vmls_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsub_s64() { + let a: i64x1 = i64x1::new(42); + let b: i64x1 = i64x1::new(1); + let e: i64x1 = i64x1::new(41); + let r: i64x1 = transmute(vqsub_s64(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_laneq_s16() { - let a: i16x4 = i16x4::new(6, 7, 8, 9); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: i16x4 = i16x4::new(0, 1, 2, 3); - let r: i16x4 = transmute(vmls_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqsubq_s64() { + let a: 
i64x2 = i64x2::new(42, 42); + let b: i64x2 = i64x2::new(1, 2); + let e: i64x2 = i64x2::new(41, 40); + let r: i64x2 = transmute(vqsubq_s64(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_lane_s16() { - let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i16x4 = i16x4::new(0, 3, 0, 0); - let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: i16x8 = transmute(vmlsq_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhadd_u8() { + let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(21, 22, 22, 23, 23, 24, 24, 25); + let r: u8x8 = transmute(vhadd_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_laneq_s16() { - let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: i16x8 = transmute(vmlsq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhaddq_u8() { + let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29); + let r: u8x16 = transmute(vhaddq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_lane_s32() { - let a: i32x2 = i32x2::new(6, 7); - let b: i32x2 = i32x2::new(2, 2); - let c: i32x2 = i32x2::new(0, 3); - let e: i32x2 = i32x2::new(0, 1); - let r: i32x2 = transmute(vmls_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhadd_u16() { + let a: u16x4 = u16x4::new(42, 42, 42, 42); + let b: u16x4 = u16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(21, 22, 22, 23); + let r: u16x4 = transmute(vhadd_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_laneq_s32() { - let a: i32x2 = i32x2::new(6, 7); - let b: i32x2 = i32x2::new(2, 2); - let c: i32x4 = i32x4::new(0, 3, 0, 0); - let e: i32x2 = i32x2::new(0, 1); - let r: i32x2 = transmute(vmls_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhaddq_u16() { + let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(21, 22, 22, 23, 23, 24, 24, 25); + let r: u16x8 = transmute(vhaddq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_lane_s32() { - let a: i32x4 = i32x4::new(6, 7, 8, 9); - let b: i32x4 = i32x4::new(2, 2, 2, 2); - let c: i32x2 = i32x2::new(0, 3); - let e: i32x4 = i32x4::new(0, 1, 2, 3); - let r: i32x4 = transmute(vmlsq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhadd_u32() { + let a: u32x2 = u32x2::new(42, 42); + let b: u32x2 = u32x2::new(1, 2); + let e: u32x2 = u32x2::new(21, 22); + let r: u32x2 = transmute(vhadd_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_laneq_s32() { - let a: i32x4 = i32x4::new(6, 7, 8, 9); - let b: i32x4 = i32x4::new(2, 2, 2, 2); - let c: i32x4 = i32x4::new(0, 3, 0, 0); - let e: i32x4 = i32x4::new(0, 1, 2, 3); - let r: i32x4 = 
transmute(vmlsq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhaddq_u32() { + let a: u32x4 = u32x4::new(42, 42, 42, 42); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(21, 22, 22, 23); + let r: u32x4 = transmute(vhaddq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_lane_u16() { - let a: u16x4 = u16x4::new(6, 7, 8, 9); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16x4 = u16x4::new(0, 3, 0, 0); - let e: u16x4 = u16x4::new(0, 1, 2, 3); - let r: u16x4 = transmute(vmls_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhadd_s8() { + let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i8x8 = i8x8::new(21, 22, 22, 23, 23, 24, 24, 25); + let r: i8x8 = transmute(vhadd_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_laneq_u16() { - let a: u16x4 = u16x4::new(6, 7, 8, 9); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: u16x4 = u16x4::new(0, 1, 2, 3); - let r: u16x4 = transmute(vmls_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhaddq_s8() { + let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: i8x16 = i8x16::new(21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29); + let r: i8x16 = transmute(vhaddq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_lane_u16() { - let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u16x4 = u16x4::new(0, 3, 0, 0); - let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: u16x8 = transmute(vmlsq_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhadd_s16() { + let a: i16x4 = i16x4::new(42, 42, 42, 42); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let e: i16x4 = i16x4::new(21, 22, 22, 23); + let r: i16x4 = transmute(vhadd_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_laneq_u16() { - let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: u16x8 = transmute(vmlsq_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhaddq_s16() { + let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i16x8 = i16x8::new(21, 22, 22, 23, 23, 24, 24, 25); + let r: i16x8 = transmute(vhaddq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_lane_u32() { - let a: u32x2 = u32x2::new(6, 7); - let b: u32x2 = u32x2::new(2, 2); - let c: u32x2 = u32x2::new(0, 3); - let e: u32x2 = u32x2::new(0, 1); - let r: u32x2 = transmute(vmls_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhadd_s32() { + let a: i32x2 = i32x2::new(42, 42); + let b: i32x2 = i32x2::new(1, 2); + let e: i32x2 = i32x2::new(21, 22); + let r: i32x2 = transmute(vhadd_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_laneq_u32() { - let a: u32x2 = u32x2::new(6, 7); - 
let b: u32x2 = u32x2::new(2, 2); - let c: u32x4 = u32x4::new(0, 3, 0, 0); - let e: u32x2 = u32x2::new(0, 1); - let r: u32x2 = transmute(vmls_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vhaddq_s32() { + let a: i32x4 = i32x4::new(42, 42, 42, 42); + let b: i32x4 = i32x4::new(1, 2, 3, 4); + let e: i32x4 = i32x4::new(21, 22, 22, 23); + let r: i32x4 = transmute(vhaddq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_lane_u32() { - let a: u32x4 = u32x4::new(6, 7, 8, 9); - let b: u32x4 = u32x4::new(2, 2, 2, 2); - let c: u32x2 = u32x2::new(0, 3); - let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsq_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhadd_u8() { + let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(22, 22, 23, 23, 24, 24, 25, 25); + let r: u8x8 = transmute(vrhadd_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_laneq_u32() { - let a: u32x4 = u32x4::new(6, 7, 8, 9); - let b: u32x4 = u32x4::new(2, 2, 2, 2); - let c: u32x4 = u32x4::new(0, 3, 0, 0); - let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsq_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhaddq_u8() { + let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29); + let r: u8x16 = transmute(vrhaddq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_lane_f32() { - let a: f32x2 = f32x2::new(6., 7.); - let b: f32x2 = f32x2::new(2., 2.); - let c: f32x2 = f32x2::new(0., 3.); - let e: f32x2 = f32x2::new(0., 1.); - let r: f32x2 = transmute(vmls_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhadd_u16() { + let a: u16x4 = u16x4::new(42, 42, 42, 42); + let b: u16x4 = u16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(22, 22, 23, 23); + let r: u16x4 = transmute(vrhadd_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_laneq_f32() { - let a: f32x2 = f32x2::new(6., 7.); - let b: f32x2 = f32x2::new(2., 2.); - let c: f32x4 = f32x4::new(0., 3., 0., 0.); - let e: f32x2 = f32x2::new(0., 1.); - let r: f32x2 = transmute(vmls_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhaddq_u16() { + let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(22, 22, 23, 23, 24, 24, 25, 25); + let r: u16x8 = transmute(vrhaddq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_lane_f32() { - let a: f32x4 = f32x4::new(6., 7., 8., 9.); - let b: f32x4 = f32x4::new(2., 2., 2., 2.); - let c: f32x2 = f32x2::new(0., 3.); - let e: f32x4 = f32x4::new(0., 1., 2., 3.); - let r: f32x4 = transmute(vmlsq_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhadd_u32() { + let a: u32x2 = u32x2::new(42, 42); + let b: u32x2 = u32x2::new(1, 2); + let e: u32x2 = u32x2::new(22, 22); + let r: u32x2 = transmute(vrhadd_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_laneq_f32() { 
- let a: f32x4 = f32x4::new(6., 7., 8., 9.); - let b: f32x4 = f32x4::new(2., 2., 2., 2.); - let c: f32x4 = f32x4::new(0., 3., 0., 0.); - let e: f32x4 = f32x4::new(0., 1., 2., 3.); - let r: f32x4 = transmute(vmlsq_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhaddq_u32() { + let a: u32x4 = u32x4::new(42, 42, 42, 42); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(22, 22, 23, 23); + let r: u32x4 = transmute(vrhaddq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_s8() { - let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: i16x8 = transmute(vmlsl_s8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhadd_s8() { + let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i8x8 = i8x8::new(22, 22, 23, 23, 24, 24, 25, 25); + let r: i8x8 = transmute(vrhadd_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_s16() { - let a: i32x4 = i32x4::new(6, 7, 8, 9); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16x4 = i16x4::new(3, 3, 3, 3); - let e: i32x4 = i32x4::new(0, 1, 2, 3); - let r: i32x4 = transmute(vmlsl_s16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhaddq_s8() { + let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: i8x16 = i8x16::new(22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29); + let r: i8x16 = transmute(vrhaddq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_s32() { - let a: i64x2 = i64x2::new(6, 7); - let b: i32x2 = i32x2::new(2, 2); - let c: i32x2 = i32x2::new(3, 3); - let e: i64x2 = i64x2::new(0, 1); - let r: i64x2 = transmute(vmlsl_s32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhadd_s16() { + let a: i16x4 = i16x4::new(42, 42, 42, 42); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let e: i16x4 = i16x4::new(22, 22, 23, 23); + let r: i16x4 = transmute(vrhadd_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_u8() { - let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: u16x8 = transmute(vmlsl_u8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhaddq_s16() { + let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i16x8 = i16x8::new(22, 22, 23, 23, 24, 24, 25, 25); + let r: i16x8 = transmute(vrhaddq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_u16() { - let a: u32x4 = u32x4::new(6, 7, 8, 9); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16x4 = u16x4::new(3, 3, 3, 3); - let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsl_u16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhadd_s32() { + let a: i32x2 = i32x2::new(42, 42); + let b: i32x2 = i32x2::new(1, 2); + let e: i32x2 = i32x2::new(22, 22); + let r: i32x2 = transmute(vrhadd_s32(transmute(a), 
transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_u32() { - let a: u64x2 = u64x2::new(6, 7); - let b: u32x2 = u32x2::new(2, 2); - let c: u32x2 = u32x2::new(3, 3); - let e: u64x2 = u64x2::new(0, 1); - let r: u64x2 = transmute(vmlsl_u32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrhaddq_s32() { + let a: i32x4 = i32x4::new(42, 42, 42, 42); + let b: i32x4 = i32x4::new(1, 2, 3, 4); + let e: i32x4 = i32x4::new(22, 22, 23, 23); + let r: i32x4 = transmute(vrhaddq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_n_s16() { - let a: i32x4 = i32x4::new(6, 7, 8, 9); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16 = 3; - let e: i32x4 = i32x4::new(0, 1, 2, 3); - let r: i32x4 = transmute(vmlsl_n_s16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrndn_f32() { + let a: f32x2 = f32x2::new(-1.5, 0.5); + let e: f32x2 = f32x2::new(-2.0, 0.0); + let r: f32x2 = transmute(vrndn_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_n_s32() { - let a: i64x2 = i64x2::new(6, 7); - let b: i32x2 = i32x2::new(2, 2); - let c: i32 = 3; - let e: i64x2 = i64x2::new(0, 1); - let r: i64x2 = transmute(vmlsl_n_s32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vrndnq_f32() { + let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5); + let e: f32x4 = f32x4::new(-2.0, 0.0, 2.0, 2.0); + let r: f32x4 = transmute(vrndnq_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_n_u16() { - let a: u32x4 = u32x4::new(6, 7, 8, 9); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16 = 3; - let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsl_n_u16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqadd_u8() { + let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(43, 44, 45, 46, 47, 48, 49, 50); + let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_n_u32() { - let a: u64x2 = u64x2::new(6, 7); - let b: u32x2 = u32x2::new(2, 2); - let c: u32 = 3; - let e: u64x2 = u64x2::new(0, 1); - let r: u64x2 = transmute(vmlsl_n_u32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqaddq_u8() { + let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58); + let r: u8x16 = transmute(vqaddq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_lane_s16() { - let a: i32x4 = i32x4::new(6, 7, 8, 9); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16x4 = i16x4::new(0, 3, 0, 0); - let e: i32x4 = i32x4::new(0, 1, 2, 3); - let r: i32x4 = transmute(vmlsl_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqadd_u16() { + let a: u16x4 = u16x4::new(42, 42, 42, 42); + let b: u16x4 = u16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(43, 44, 45, 46); + let r: u16x4 = transmute(vqadd_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_laneq_s16() { - let a: i32x4 = i32x4::new(6, 7, 8, 9); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: i32x4 = i32x4::new(0, 1, 2, 
3); - let r: i32x4 = transmute(vmlsl_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqaddq_u16() { + let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(43, 44, 45, 46, 47, 48, 49, 50); + let r: u16x8 = transmute(vqaddq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_lane_s32() { - let a: i64x2 = i64x2::new(6, 7); - let b: i32x2 = i32x2::new(2, 2); - let c: i32x2 = i32x2::new(0, 3); - let e: i64x2 = i64x2::new(0, 1); - let r: i64x2 = transmute(vmlsl_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqadd_u32() { + let a: u32x2 = u32x2::new(42, 42); + let b: u32x2 = u32x2::new(1, 2); + let e: u32x2 = u32x2::new(43, 44); + let r: u32x2 = transmute(vqadd_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_laneq_s32() { - let a: i64x2 = i64x2::new(6, 7); - let b: i32x2 = i32x2::new(2, 2); - let c: i32x4 = i32x4::new(0, 3, 0, 0); - let e: i64x2 = i64x2::new(0, 1); - let r: i64x2 = transmute(vmlsl_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqaddq_u32() { + let a: u32x4 = u32x4::new(42, 42, 42, 42); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(43, 44, 45, 46); + let r: u32x4 = transmute(vqaddq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_lane_u16() { - let a: u32x4 = u32x4::new(6, 7, 8, 9); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16x4 = u16x4::new(0, 3, 0, 0); - let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsl_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqadd_u64() { + let a: u64x1 = u64x1::new(42); + let b: u64x1 = u64x1::new(1); + let e: u64x1 = u64x1::new(43); + let r: u64x1 = transmute(vqadd_u64(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_laneq_u16() { - let a: u32x4 = u32x4::new(6, 7, 8, 9); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); - let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsl_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqaddq_u64() { + let a: u64x2 = u64x2::new(42, 42); + let b: u64x2 = u64x2::new(1, 2); + let e: u64x2 = u64x2::new(43, 44); + let r: u64x2 = transmute(vqaddq_u64(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_lane_u32() { - let a: u64x2 = u64x2::new(6, 7); - let b: u32x2 = u32x2::new(2, 2); - let c: u32x2 = u32x2::new(0, 3); - let e: u64x2 = u64x2::new(0, 1); - let r: u64x2 = transmute(vmlsl_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqadd_s8() { + let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i8x8 = i8x8::new(43, 44, 45, 46, 47, 48, 49, 50); + let r: i8x8 = transmute(vqadd_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsl_laneq_u32() { - let a: u64x2 = u64x2::new(6, 7); - let b: u32x2 = u32x2::new(2, 2); - let c: u32x4 = u32x4::new(0, 3, 0, 0); - let e: u64x2 = u64x2::new(0, 1); - let r: u64x2 = transmute(vmlsl_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vqaddq_s8() { + let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 
42, 42, 42, 42, 42, 42, 42);
+        let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x16 = i8x16::new(43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58);
+        let r: i8x16 = transmute(vqaddq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vneg_s8() {
-        let a: i8x8 = i8x8::new(0, 1, -1, 2, -2, 3, -3, 4);
-        let e: i8x8 = i8x8::new(0, -1, 1, -2, 2, -3, 3, -4);
-        let r: i8x8 = transmute(vneg_s8(transmute(a)));
+    unsafe fn test_vqadd_s16() {
+        let a: i16x4 = i16x4::new(42, 42, 42, 42);
+        let b: i16x4 = i16x4::new(1, 2, 3, 4);
+        let e: i16x4 = i16x4::new(43, 44, 45, 46);
+        let r: i16x4 = transmute(vqadd_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vnegq_s8() {
-        let a: i8x16 = i8x16::new(0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8);
-        let e: i8x16 = i8x16::new(0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8);
-        let r: i8x16 = transmute(vnegq_s8(transmute(a)));
+    unsafe fn test_vqaddq_s16() {
+        let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+        let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let e: i16x8 = i16x8::new(43, 44, 45, 46, 47, 48, 49, 50);
+        let r: i16x8 = transmute(vqaddq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vneg_s16() {
-        let a: i16x4 = i16x4::new(0, 1, -1, 2);
-        let e: i16x4 = i16x4::new(0, -1, 1, -2);
-        let r: i16x4 = transmute(vneg_s16(transmute(a)));
+    unsafe fn test_vqadd_s32() {
+        let a: i32x2 = i32x2::new(42, 42);
+        let b: i32x2 = i32x2::new(1, 2);
+        let e: i32x2 = i32x2::new(43, 44);
+        let r: i32x2 = transmute(vqadd_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vnegq_s16() {
-        let a: i16x8 = i16x8::new(0, 1, -1, 2, -2, 3, -3, 4);
-        let e: i16x8 = i16x8::new(0, -1, 1, -2, 2, -3, 3, -4);
-        let r: i16x8 = transmute(vnegq_s16(transmute(a)));
+    unsafe fn test_vqaddq_s32() {
+        let a: i32x4 = i32x4::new(42, 42, 42, 42);
+        let b: i32x4 = i32x4::new(1, 2, 3, 4);
+        let e: i32x4 = i32x4::new(43, 44, 45, 46);
+        let r: i32x4 = transmute(vqaddq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vneg_s32() {
-        let a: i32x2 = i32x2::new(0, 1);
-        let e: i32x2 = i32x2::new(0, -1);
-        let r: i32x2 = transmute(vneg_s32(transmute(a)));
+    unsafe fn test_vqadd_s64() {
+        let a: i64x1 = i64x1::new(42);
+        let b: i64x1 = i64x1::new(1);
+        let e: i64x1 = i64x1::new(43);
+        let r: i64x1 = transmute(vqadd_s64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vnegq_s32() {
-        let a: i32x4 = i32x4::new(0, 1, -1, 2);
-        let e: i32x4 = i32x4::new(0, -1, 1, -2);
-        let r: i32x4 = transmute(vnegq_s32(transmute(a)));
+    unsafe fn test_vqaddq_s64() {
+        let a: i64x2 = i64x2::new(42, 42);
+        let b: i64x2 = i64x2::new(1, 2);
+        let e: i64x2 = i64x2::new(43, 44);
+        let r: i64x2 = transmute(vqaddq_s64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vneg_f32() {
-        let a: f32x2 = f32x2::new(0., 1.);
-        let e: f32x2 = f32x2::new(0., -1.);
-        let r: f32x2 = transmute(vneg_f32(transmute(a)));
+    unsafe fn test_vld1_s8_x2() {
+        let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let e: [i8x8; 2] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
+        let r: [i8x8; 2] = transmute(vld1_s8_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
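// Editorial sketch (illustration only; not part of the generated diff): the
// vld1_*_x2/x3/x4 tests below all follow one pattern because these intrinsics
// load two, three, or four vectors from contiguous memory without any
// de-interleaving (unlike vld2/vld3/vld4). A plain-Rust model of vld1_s8_x2,
// with a hypothetical name and a safe array-based signature, is just one
// 16-byte contiguous copy split into two 8-byte halves:
fn vld1_s8_x2_model(src: &[i8; 16]) -> [[i8; 8]; 2] {
    let mut out = [[0i8; 8]; 2];
    out[0].copy_from_slice(&src[..8]); // first register gets elements 0..8
    out[1].copy_from_slice(&src[8..]); // second register gets elements 8..16
    out
}
// This is why each test can build its expected value by slicing one flat
// array, and why the vst1_*_xN stores added by this patch are tested as the
// mirror image: load N vectors from an offset array, store them back to
// contiguous memory, and compare.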
     #[simd_test(enable = "neon")]
-    unsafe fn test_vnegq_f32() {
-        let a: f32x4 = f32x4::new(0., 1., -1., 2.);
-        let e: f32x4 = f32x4::new(0., -1., 1., -2.);
-        let r: f32x4 = transmute(vnegq_f32(transmute(a)));
+    unsafe fn test_vld1_s16_x2() {
+        let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+        let e: [i16x4; 2] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8)];
+        let r: [i16x4; 2] = transmute(vld1_s16_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqneg_s8() {
-        let a: i8x8 = i8x8::new(-128, 0, 1, -1, 2, -2, 3, -3);
-        let e: i8x8 = i8x8::new(0x7F, 0, -1, 1, -2, 2, -3, 3);
-        let r: i8x8 = transmute(vqneg_s8(transmute(a)));
+    unsafe fn test_vld1_s32_x2() {
+        let a: [i32; 5] = [0, 1, 2, 3, 4];
+        let e: [i32x2; 2] = [i32x2::new(1, 2), i32x2::new(3, 4)];
+        let r: [i32x2; 2] = transmute(vld1_s32_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqnegq_s8() {
-        let a: i8x16 = i8x16::new(-128, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7);
-        let e: i8x16 = i8x16::new(0x7F, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
-        let r: i8x16 = transmute(vqnegq_s8(transmute(a)));
+    unsafe fn test_vld1_s64_x2() {
+        let a: [i64; 3] = [0, 1, 2];
+        let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)];
+        let r: [i64x1; 2] = transmute(vld1_s64_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqneg_s16() {
-        let a: i16x4 = i16x4::new(-32768, 0, 1, -1);
-        let e: i16x4 = i16x4::new(0x7F_FF, 0, -1, 1);
-        let r: i16x4 = transmute(vqneg_s16(transmute(a)));
+    unsafe fn test_vld1q_s8_x2() {
+        let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+        let e: [i8x16; 2] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
+        let r: [i8x16; 2] = transmute(vld1q_s8_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqnegq_s16() {
-        let a: i16x8 = i16x8::new(-32768, 0, 1, -1, 2, -2, 3, -3);
-        let e: i16x8 = i16x8::new(0x7F_FF, 0, -1, 1, -2, 2, -3, 3);
-        let r: i16x8 = transmute(vqnegq_s16(transmute(a)));
+    unsafe fn test_vld1q_s16_x2() {
+        let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let e: [i16x8; 2] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
+        let r: [i16x8; 2] = transmute(vld1q_s16_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqneg_s32() {
-        let a: i32x2 = i32x2::new(-2147483648, 0);
-        let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0);
-        let r: i32x2 = transmute(vqneg_s32(transmute(a)));
+    unsafe fn test_vld1q_s32_x2() {
+        let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+        let e: [i32x4; 2] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 7, 8)];
+        let r: [i32x4; 2] = transmute(vld1q_s32_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqnegq_s32() {
-        let a: i32x4 = i32x4::new(-2147483648, 0, 1, -1);
-        let e: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0, -1, 1);
-        let r: i32x4 = transmute(vqnegq_s32(transmute(a)));
+    unsafe fn test_vld1q_s64_x2() {
+        let a: [i64; 5] = [0, 1, 2, 3, 4];
+        let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(3, 4)];
+        let r: [i64x2; 2] = transmute(vld1q_s64_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqsub_u8() {
-        let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
-        let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(41, 40, 39, 38, 37, 36, 35, 34);
-        let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b)));
+    unsafe fn test_vld1_s8_x3() {
+        let a: [i8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+        let e: [i8x8; 3] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
+        let r: [i8x8; 3] = transmute(vld1_s8_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqsubq_u8() {
-        let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
-        let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26);
-        let r: u8x16 = transmute(vqsubq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vld1_s16_x3() {
+        let a: [i16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let e: [i16x4; 3] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12)];
+        let r: [i16x4; 3] = transmute(vld1_s16_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqsub_u16() {
-        let a: u16x4 = u16x4::new(42, 42, 42, 42);
-        let b: u16x4 = u16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(41, 40, 39, 38);
-        let r: u16x4 = transmute(vqsub_u16(transmute(a), transmute(b)));
+    unsafe fn test_vld1_s32_x3() {
+        let a: [i32; 7] = [0, 1, 2, 3, 4, 5, 6];
+        let e: [i32x2; 3] = [i32x2::new(1, 2), i32x2::new(3, 4), i32x2::new(5, 6)];
+        let r: [i32x2; 3] = transmute(vld1_s32_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqsubq_u16() {
-        let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
-        let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(41, 40, 39, 38, 37, 36, 35, 34);
-        let r: u16x8 = transmute(vqsubq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vld1_s64_x3() {
+        let a: [i64; 4] = [0, 1, 2, 3];
+        let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(3)];
+        let r: [i64x1; 3] = transmute(vld1_s64_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqsub_u32() {
-        let a: u32x2 = u32x2::new(42, 42);
-        let b: u32x2 = u32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(41, 40);
-        let r: u32x2 = transmute(vqsub_u32(transmute(a), transmute(b)));
+    unsafe fn test_vld1q_s8_x3() {
+        let a: [i8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let e: [i8x16; 3] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)];
+        let r: [i8x16; 3] = transmute(vld1q_s8_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqsubq_u32() {
-        let a: u32x4 = u32x4::new(42, 42, 42, 42);
-        let b: u32x4 = u32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(41, 40, 39, 38);
-        let r: u32x4 = transmute(vqsubq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vld1q_s16_x3() {
+        let a: [i16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+        let e: [i16x8; 3] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19,
20, 21, 22, 23, 24)]; + let r: [i16x8; 3] = transmute(vld1q_s16_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsub_u64() { - let a: u64x1 = u64x1::new(42); - let b: u64x1 = u64x1::new(1); - let e: u64x1 = u64x1::new(41); - let r: u64x1 = transmute(vqsub_u64(transmute(a), transmute(b))); + unsafe fn test_vld1q_s32_x3() { + let a: [i32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + let e: [i32x4; 3] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 7, 8), i32x4::new(9, 10, 11, 12)]; + let r: [i32x4; 3] = transmute(vld1q_s32_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsubq_u64() { - let a: u64x2 = u64x2::new(42, 42); - let b: u64x2 = u64x2::new(1, 2); - let e: u64x2 = u64x2::new(41, 40); - let r: u64x2 = transmute(vqsubq_u64(transmute(a), transmute(b))); + unsafe fn test_vld1q_s64_x3() { + let a: [i64; 7] = [0, 1, 2, 3, 4, 5, 6]; + let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6)]; + let r: [i64x2; 3] = transmute(vld1q_s64_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsub_s8() { - let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i8x8 = i8x8::new(41, 40, 39, 38, 37, 36, 35, 34); - let r: i8x8 = transmute(vqsub_s8(transmute(a), transmute(b))); + unsafe fn test_vld1_s8_x4() { + let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [i8x8; 4] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24), i8x8::new(25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [i8x8; 4] = transmute(vld1_s8_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsubq_s8() { - let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); - let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let e: i8x16 = i8x16::new(41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26); - let r: i8x16 = transmute(vqsubq_s8(transmute(a), transmute(b))); + unsafe fn test_vld1_s16_x4() { + let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [i16x4; 4] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12), i16x4::new(13, 14, 15, 16)]; + let r: [i16x4; 4] = transmute(vld1_s16_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsub_s16() { - let a: i16x4 = i16x4::new(42, 42, 42, 42); - let b: i16x4 = i16x4::new(1, 2, 3, 4); - let e: i16x4 = i16x4::new(41, 40, 39, 38); - let r: i16x4 = transmute(vqsub_s16(transmute(a), transmute(b))); + unsafe fn test_vld1_s32_x4() { + let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [i32x2; 4] = [i32x2::new(1, 2), i32x2::new(3, 4), i32x2::new(5, 6), i32x2::new(7, 8)]; + let r: [i32x2; 4] = transmute(vld1_s32_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsubq_s16() { - let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i16x8 = i16x8::new(41, 40, 39, 38, 37, 36, 35, 34); - let r: i16x8 = transmute(vqsubq_s16(transmute(a), transmute(b))); + unsafe fn test_vld1_s64_x4() { + let a: [i64; 5] = [0, 1, 2, 3, 4]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(3), i64x1::new(4)]; + let r: [i64x1; 4] = 
transmute(vld1_s64_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsub_s32() { - let a: i32x2 = i32x2::new(42, 42); - let b: i32x2 = i32x2::new(1, 2); - let e: i32x2 = i32x2::new(41, 40); - let r: i32x2 = transmute(vqsub_s32(transmute(a), transmute(b))); + unsafe fn test_vld1q_s8_x4() { + let a: [i8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [i8x16; 4] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [i8x16; 4] = transmute(vld1q_s8_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsubq_s32() { - let a: i32x4 = i32x4::new(42, 42, 42, 42); - let b: i32x4 = i32x4::new(1, 2, 3, 4); - let e: i32x4 = i32x4::new(41, 40, 39, 38); - let r: i32x4 = transmute(vqsubq_s32(transmute(a), transmute(b))); + unsafe fn test_vld1q_s16_x4() { + let a: [i16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [i16x8; 4] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24), i16x8::new(25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [i16x8; 4] = transmute(vld1q_s16_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsub_s64() { - let a: i64x1 = i64x1::new(42); - let b: i64x1 = i64x1::new(1); - let e: i64x1 = i64x1::new(41); - let r: i64x1 = transmute(vqsub_s64(transmute(a), transmute(b))); + unsafe fn test_vld1q_s32_x4() { + let a: [i32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [i32x4; 4] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 7, 8), i32x4::new(9, 10, 11, 12), i32x4::new(13, 14, 15, 16)]; + let r: [i32x4; 4] = transmute(vld1q_s32_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqsubq_s64() { - let a: i64x2 = i64x2::new(42, 42); - let b: i64x2 = i64x2::new(1, 2); - let e: i64x2 = i64x2::new(41, 40); - let r: i64x2 = transmute(vqsubq_s64(transmute(a), transmute(b))); + unsafe fn test_vld1q_s64_x4() { + let a: [i64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6), i64x2::new(7, 8)]; + let r: [i64x2; 4] = transmute(vld1q_s64_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhadd_u8() { - let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u8x8 = u8x8::new(21, 22, 22, 23, 23, 24, 24, 25); - let r: u8x8 = transmute(vhadd_u8(transmute(a), transmute(b))); + unsafe fn test_vld1_u8_x2() { + let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [u8x8; 2] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16)]; + let r: [u8x8; 2] = transmute(vld1_u8_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhaddq_u8() { - let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16); - let e: u8x16 = u8x16::new(21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29); - let r: u8x16 = transmute(vhaddq_u8(transmute(a), transmute(b))); + unsafe fn test_vld1_u16_x2() { + let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [u16x4; 2] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8)]; + let r: [u16x4; 2] = transmute(vld1_u16_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhadd_u16() { - let a: u16x4 = u16x4::new(42, 42, 42, 42); - let b: u16x4 = u16x4::new(1, 2, 3, 4); - let e: u16x4 = u16x4::new(21, 22, 22, 23); - let r: u16x4 = transmute(vhadd_u16(transmute(a), transmute(b))); + unsafe fn test_vld1_u32_x2() { + let a: [u32; 5] = [0, 1, 2, 3, 4]; + let e: [u32x2; 2] = [u32x2::new(1, 2), u32x2::new(3, 4)]; + let r: [u32x2; 2] = transmute(vld1_u32_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhaddq_u16() { - let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u16x8 = u16x8::new(21, 22, 22, 23, 23, 24, 24, 25); - let r: u16x8 = transmute(vhaddq_u16(transmute(a), transmute(b))); + unsafe fn test_vld1_u64_x2() { + let a: [u64; 3] = [0, 1, 2]; + let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)]; + let r: [u64x1; 2] = transmute(vld1_u64_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhadd_u32() { - let a: u32x2 = u32x2::new(42, 42); - let b: u32x2 = u32x2::new(1, 2); - let e: u32x2 = u32x2::new(21, 22); - let r: u32x2 = transmute(vhadd_u32(transmute(a), transmute(b))); + unsafe fn test_vld1q_u8_x2() { + let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [u8x16; 2] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [u8x16; 2] = transmute(vld1q_u8_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhaddq_u32() { - let a: u32x4 = u32x4::new(42, 42, 42, 42); - let b: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(21, 22, 22, 23); - let r: u32x4 = transmute(vhaddq_u32(transmute(a), transmute(b))); + unsafe fn test_vld1q_u16_x2() { + let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [u16x8; 2] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16)]; + let r: [u16x8; 2] = transmute(vld1q_u16_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhadd_s8() { - let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i8x8 = i8x8::new(21, 22, 22, 23, 23, 24, 24, 25); - let r: i8x8 = transmute(vhadd_s8(transmute(a), transmute(b))); + unsafe fn test_vld1q_u32_x2() { + let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [u32x4; 2] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8)]; + let r: [u32x4; 2] = transmute(vld1q_u32_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhaddq_s8() { - let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); - let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let e: i8x16 = i8x16::new(21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29); - let r: i8x16 = transmute(vhaddq_s8(transmute(a), 
transmute(b))); + unsafe fn test_vld1q_u64_x2() { + let a: [u64; 5] = [0, 1, 2, 3, 4]; + let e: [u64x2; 2] = [u64x2::new(1, 2), u64x2::new(3, 4)]; + let r: [u64x2; 2] = transmute(vld1q_u64_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhadd_s16() { - let a: i16x4 = i16x4::new(42, 42, 42, 42); - let b: i16x4 = i16x4::new(1, 2, 3, 4); - let e: i16x4 = i16x4::new(21, 22, 22, 23); - let r: i16x4 = transmute(vhadd_s16(transmute(a), transmute(b))); + unsafe fn test_vld1_u8_x3() { + let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; + let e: [u8x8; 3] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16), u8x8::new(17, 18, 19, 20, 21, 22, 23, 24)]; + let r: [u8x8; 3] = transmute(vld1_u8_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhaddq_s16() { - let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i16x8 = i16x8::new(21, 22, 22, 23, 23, 24, 24, 25); - let r: i16x8 = transmute(vhaddq_s16(transmute(a), transmute(b))); + unsafe fn test_vld1_u16_x3() { + let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + let e: [u16x4; 3] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8), u16x4::new(9, 10, 11, 12)]; + let r: [u16x4; 3] = transmute(vld1_u16_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhadd_s32() { - let a: i32x2 = i32x2::new(42, 42); - let b: i32x2 = i32x2::new(1, 2); - let e: i32x2 = i32x2::new(21, 22); - let r: i32x2 = transmute(vhadd_s32(transmute(a), transmute(b))); + unsafe fn test_vld1_u32_x3() { + let a: [u32; 7] = [0, 1, 2, 3, 4, 5, 6]; + let e: [u32x2; 3] = [u32x2::new(1, 2), u32x2::new(3, 4), u32x2::new(5, 6)]; + let r: [u32x2; 3] = transmute(vld1_u32_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vhaddq_s32() { - let a: i32x4 = i32x4::new(42, 42, 42, 42); - let b: i32x4 = i32x4::new(1, 2, 3, 4); - let e: i32x4 = i32x4::new(21, 22, 22, 23); - let r: i32x4 = transmute(vhaddq_s32(transmute(a), transmute(b))); + unsafe fn test_vld1_u64_x3() { + let a: [u64; 4] = [0, 1, 2, 3]; + let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(3)]; + let r: [u64x1; 3] = transmute(vld1_u64_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhadd_u8() { - let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u8x8 = u8x8::new(22, 22, 23, 23, 24, 24, 25, 25); - let r: u8x8 = transmute(vrhadd_u8(transmute(a), transmute(b))); + unsafe fn test_vld1q_u8_x3() { + let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [u8x16; 3] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)]; + let r: [u8x16; 3] = transmute(vld1q_u8_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhaddq_u8() { - let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let e: u8x16 = u8x16::new(22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 
27, 28, 28, 29, 29); - let r: u8x16 = transmute(vrhaddq_u8(transmute(a), transmute(b))); + unsafe fn test_vld1q_u16_x3() { + let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; + let e: [u16x8; 3] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16), u16x8::new(17, 18, 19, 20, 21, 22, 23, 24)]; + let r: [u16x8; 3] = transmute(vld1q_u16_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhadd_u16() { - let a: u16x4 = u16x4::new(42, 42, 42, 42); - let b: u16x4 = u16x4::new(1, 2, 3, 4); - let e: u16x4 = u16x4::new(22, 22, 23, 23); - let r: u16x4 = transmute(vrhadd_u16(transmute(a), transmute(b))); + unsafe fn test_vld1q_u32_x3() { + let a: [u32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + let e: [u32x4; 3] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8), u32x4::new(9, 10, 11, 12)]; + let r: [u32x4; 3] = transmute(vld1q_u32_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhaddq_u16() { - let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u16x8 = u16x8::new(22, 22, 23, 23, 24, 24, 25, 25); - let r: u16x8 = transmute(vrhaddq_u16(transmute(a), transmute(b))); + unsafe fn test_vld1q_u64_x3() { + let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6]; + let e: [u64x2; 3] = [u64x2::new(1, 2), u64x2::new(3, 4), u64x2::new(5, 6)]; + let r: [u64x2; 3] = transmute(vld1q_u64_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhadd_u32() { - let a: u32x2 = u32x2::new(42, 42); - let b: u32x2 = u32x2::new(1, 2); - let e: u32x2 = u32x2::new(22, 22); - let r: u32x2 = transmute(vrhadd_u32(transmute(a), transmute(b))); + unsafe fn test_vld1_u8_x4() { + let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [u8x8; 4] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16), u8x8::new(17, 18, 19, 20, 21, 22, 23, 24), u8x8::new(25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [u8x8; 4] = transmute(vld1_u8_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhaddq_u32() { - let a: u32x4 = u32x4::new(42, 42, 42, 42); - let b: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(22, 22, 23, 23); - let r: u32x4 = transmute(vrhaddq_u32(transmute(a), transmute(b))); + unsafe fn test_vld1_u16_x4() { + let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [u16x4; 4] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8), u16x4::new(9, 10, 11, 12), u16x4::new(13, 14, 15, 16)]; + let r: [u16x4; 4] = transmute(vld1_u16_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhadd_s8() { - let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i8x8 = i8x8::new(22, 22, 23, 23, 24, 24, 25, 25); - let r: i8x8 = transmute(vrhadd_s8(transmute(a), transmute(b))); + unsafe fn test_vld1_u32_x4() { + let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [u32x2; 4] = [u32x2::new(1, 2), u32x2::new(3, 4), u32x2::new(5, 6), u32x2::new(7, 8)]; + let r: [u32x2; 4] = transmute(vld1_u32_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhaddq_s8() { - let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); - let b: i8x16 = 
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let e: i8x16 = i8x16::new(22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29); - let r: i8x16 = transmute(vrhaddq_s8(transmute(a), transmute(b))); + unsafe fn test_vld1_u64_x4() { + let a: [u64; 5] = [0, 1, 2, 3, 4]; + let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(3), u64x1::new(4)]; + let r: [u64x1; 4] = transmute(vld1_u64_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhadd_s16() { - let a: i16x4 = i16x4::new(42, 42, 42, 42); - let b: i16x4 = i16x4::new(1, 2, 3, 4); - let e: i16x4 = i16x4::new(22, 22, 23, 23); - let r: i16x4 = transmute(vrhadd_s16(transmute(a), transmute(b))); + unsafe fn test_vld1q_u8_x4() { + let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [u8x16; 4] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [u8x16; 4] = transmute(vld1q_u8_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhaddq_s16() { - let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i16x8 = i16x8::new(22, 22, 23, 23, 24, 24, 25, 25); - let r: i16x8 = transmute(vrhaddq_s16(transmute(a), transmute(b))); + unsafe fn test_vld1q_u16_x4() { + let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [u16x8; 4] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16), u16x8::new(17, 18, 19, 20, 21, 22, 23, 24), u16x8::new(25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [u16x8; 4] = transmute(vld1q_u16_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhadd_s32() { - let a: i32x2 = i32x2::new(42, 42); - let b: i32x2 = i32x2::new(1, 2); - let e: i32x2 = i32x2::new(22, 22); - let r: i32x2 = transmute(vrhadd_s32(transmute(a), transmute(b))); + unsafe fn test_vld1q_u32_x4() { + let a: [u32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [u32x4; 4] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8), u32x4::new(9, 10, 11, 12), u32x4::new(13, 14, 15, 16)]; + let r: [u32x4; 4] = transmute(vld1q_u32_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrhaddq_s32() { - let a: i32x4 = i32x4::new(42, 42, 42, 42); - let b: i32x4 = i32x4::new(1, 2, 3, 4); - let e: i32x4 = i32x4::new(22, 22, 23, 23); - let r: i32x4 = transmute(vrhaddq_s32(transmute(a), transmute(b))); + unsafe fn test_vld1q_u64_x4() { + let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [u64x2; 4] = [u64x2::new(1, 2), u64x2::new(3, 4), u64x2::new(5, 6), u64x2::new(7, 8)]; + let r: [u64x2; 4] = transmute(vld1q_u64_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrndn_f32() { - let a: f32x2 = f32x2::new(-1.5, 0.5); - let e: f32x2 = f32x2::new(-2.0, 0.0); - let r: f32x2 = transmute(vrndn_f32(transmute(a))); + unsafe fn test_vld1_p8_x2() { + let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 16]; + let e: [i8x8; 2] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16)]; + let r: [i8x8; 2] = transmute(vld1_p8_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vrndnq_f32() { - let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5); - let e: f32x4 = f32x4::new(-2.0, 0.0, 2.0, 2.0); - let r: f32x4 = transmute(vrndnq_f32(transmute(a))); + unsafe fn test_vld1_p8_x3() { + let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; + let e: [i8x8; 3] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24)]; + let r: [i8x8; 3] = transmute(vld1_p8_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqadd_u8() { - let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u8x8 = u8x8::new(43, 44, 45, 46, 47, 48, 49, 50); - let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b))); + unsafe fn test_vld1_p8_x4() { + let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [i8x8; 4] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24), i8x8::new(25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [i8x8; 4] = transmute(vld1_p8_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqaddq_u8() { - let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let e: u8x16 = u8x16::new(43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58); - let r: u8x16 = transmute(vqaddq_u8(transmute(a), transmute(b))); + unsafe fn test_vld1q_p8_x2() { + let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [i8x16; 2] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [i8x16; 2] = transmute(vld1q_p8_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqadd_u16() { - let a: u16x4 = u16x4::new(42, 42, 42, 42); - let b: u16x4 = u16x4::new(1, 2, 3, 4); - let e: u16x4 = u16x4::new(43, 44, 45, 46); - let r: u16x4 = transmute(vqadd_u16(transmute(a), transmute(b))); + unsafe fn test_vld1q_p8_x3() { + let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [i8x16; 3] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)]; + let r: [i8x16; 3] = transmute(vld1q_p8_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqaddq_u16() { - let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u16x8 = u16x8::new(43, 44, 45, 46, 47, 48, 49, 50); - let r: u16x8 = transmute(vqaddq_u16(transmute(a), transmute(b))); + unsafe fn test_vld1q_p8_x4() { + let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [i8x16; 4] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [i8x16; 4] = transmute(vld1q_p8_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqadd_u32() { - let a: u32x2 = u32x2::new(42, 42); - let b: u32x2 = u32x2::new(1, 2); - let e: u32x2 = u32x2::new(43, 44); - let r: u32x2 = transmute(vqadd_u32(transmute(a), transmute(b))); + unsafe fn test_vld1_p16_x2() { + let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [i16x4; 2] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8)]; + let r: [i16x4; 2] = transmute(vld1_p16_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqaddq_u32() { - let a: u32x4 = u32x4::new(42, 42, 42, 42); - let b: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(43, 44, 45, 46); - let r: u32x4 = transmute(vqaddq_u32(transmute(a), transmute(b))); + unsafe fn test_vld1_p16_x3() { + let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + let e: [i16x4; 3] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12)]; + let r: [i16x4; 3] = transmute(vld1_p16_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqadd_u64() { - let a: u64x1 = u64x1::new(42); - let b: u64x1 = u64x1::new(1); - let e: u64x1 = u64x1::new(43); - let r: u64x1 = transmute(vqadd_u64(transmute(a), transmute(b))); + unsafe fn test_vld1_p16_x4() { + let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [i16x4; 4] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12), i16x4::new(13, 14, 15, 16)]; + let r: [i16x4; 4] = transmute(vld1_p16_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqaddq_u64() { - let a: u64x2 = u64x2::new(42, 42); - let b: u64x2 = u64x2::new(1, 2); - let e: u64x2 = u64x2::new(43, 44); - let r: u64x2 = transmute(vqaddq_u64(transmute(a), transmute(b))); + unsafe fn test_vld1q_p16_x2() { + let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [i16x8; 2] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16)]; + let r: [i16x8; 2] = transmute(vld1q_p16_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqadd_s8() { - let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i8x8 = i8x8::new(43, 44, 45, 46, 47, 48, 49, 50); - let r: i8x8 = transmute(vqadd_s8(transmute(a), transmute(b))); + unsafe fn test_vld1q_p16_x3() { + let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; + let e: [i16x8; 3] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24)]; + let r: [i16x8; 3] = transmute(vld1q_p16_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqaddq_s8() { - let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42); - let b: i8x16 = 
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let e: i8x16 = i8x16::new(43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58); - let r: i8x16 = transmute(vqaddq_s8(transmute(a), transmute(b))); + unsafe fn test_vld1q_p16_x4() { + let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let e: [i16x8; 4] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24), i16x8::new(25, 26, 27, 28, 29, 30, 31, 32)]; + let r: [i16x8; 4] = transmute(vld1q_p16_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqadd_s16() { - let a: i16x4 = i16x4::new(42, 42, 42, 42); - let b: i16x4 = i16x4::new(1, 2, 3, 4); - let e: i16x4 = i16x4::new(43, 44, 45, 46); - let r: i16x4 = transmute(vqadd_s16(transmute(a), transmute(b))); + unsafe fn test_vld1_f32_x2() { + let a: [f32; 5] = [0., 1., 2., 3., 4.]; + let e: [f32x2; 2] = [f32x2::new(1., 2.), f32x2::new(3., 4.)]; + let r: [f32x2; 2] = transmute(vld1_f32_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqaddq_s16() { - let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); - let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i16x8 = i16x8::new(43, 44, 45, 46, 47, 48, 49, 50); - let r: i16x8 = transmute(vqaddq_s16(transmute(a), transmute(b))); + unsafe fn test_vld1q_f32_x2() { + let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.]; + let e: [f32x4; 2] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.)]; + let r: [f32x4; 2] = transmute(vld1q_f32_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqadd_s32() { - let a: i32x2 = i32x2::new(42, 42); - let b: i32x2 = i32x2::new(1, 2); - let e: i32x2 = i32x2::new(43, 44); - let r: i32x2 = transmute(vqadd_s32(transmute(a), transmute(b))); + unsafe fn test_vld1_f32_x3() { + let a: [f32; 7] = [0., 1., 2., 3., 4., 5., 6.]; + let e: [f32x2; 3] = [f32x2::new(1., 2.), f32x2::new(3., 4.), f32x2::new(5., 6.)]; + let r: [f32x2; 3] = transmute(vld1_f32_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqaddq_s32() { - let a: i32x4 = i32x4::new(42, 42, 42, 42); - let b: i32x4 = i32x4::new(1, 2, 3, 4); - let e: i32x4 = i32x4::new(43, 44, 45, 46); - let r: i32x4 = transmute(vqaddq_s32(transmute(a), transmute(b))); + unsafe fn test_vld1q_f32_x3() { + let a: [f32; 13] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.]; + let e: [f32x4; 3] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.), f32x4::new(9., 10., 11., 12.)]; + let r: [f32x4; 3] = transmute(vld1q_f32_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqadd_s64() { - let a: i64x1 = i64x1::new(42); - let b: i64x1 = i64x1::new(1); - let e: i64x1 = i64x1::new(43); - let r: i64x1 = transmute(vqadd_s64(transmute(a), transmute(b))); + unsafe fn test_vld1_f32_x4() { + let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.]; + let e: [f32x2; 4] = [f32x2::new(1., 2.), f32x2::new(3., 4.), f32x2::new(5., 6.), f32x2::new(7., 8.)]; + let r: [f32x2; 4] = transmute(vld1_f32_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqaddq_s64() { - let a: i64x2 = i64x2::new(42, 42); - let b: i64x2 = i64x2::new(1, 2); - let e: i64x2 = i64x2::new(43, 44); - let r: i64x2 = transmute(vqaddq_s64(transmute(a), transmute(b))); + unsafe fn test_vld1q_f32_x4() { 
+ let a: [f32; 17] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.]; + let e: [f32x4; 4] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.), f32x4::new(9., 10., 11., 12.), f32x4::new(13., 14., 15., 16.)]; + let r: [f32x4; 4] = transmute(vld1q_f32_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s8_x2() { + unsafe fn test_vst1_s8_x2() { let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let e: [i8x8; 2] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16)]; - let r: [i8x8; 2] = transmute(vld1_s8_x2(a[1..].as_ptr())); + let e: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let mut r: [i8; 16] = [0i8; 16]; + vst1_s8_x2(r.as_mut_ptr(), vld1_s8_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s16_x2() { + unsafe fn test_vst1_s16_x2() { let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; - let e: [i16x4; 2] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8)]; - let r: [i16x4; 2] = transmute(vld1_s16_x2(a[1..].as_ptr())); + let e: [i16; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let mut r: [i16; 8] = [0i16; 8]; + vst1_s16_x2(r.as_mut_ptr(), vld1_s16_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s32_x2() { + unsafe fn test_vst1_s32_x2() { let a: [i32; 5] = [0, 1, 2, 3, 4]; - let e: [i32x2; 2] = [i32x2::new(1, 2), i32x2::new(3, 4)]; - let r: [i32x2; 2] = transmute(vld1_s32_x2(a[1..].as_ptr())); + let e: [i32; 4] = [1, 2, 3, 4]; + let mut r: [i32; 4] = [0i32; 4]; + vst1_s32_x2(r.as_mut_ptr(), vld1_s32_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s64_x2() { + unsafe fn test_vst1_s64_x2() { let a: [i64; 3] = [0, 1, 2]; - let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)]; - let r: [i64x1; 2] = transmute(vld1_s64_x2(a[1..].as_ptr())); + let e: [i64; 2] = [1, 2]; + let mut r: [i64; 2] = [0i64; 2]; + vst1_s64_x2(r.as_mut_ptr(), vld1_s64_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s8_x2() { + unsafe fn test_vst1q_s8_x2() { let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; - let e: [i8x16; 2] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)]; - let r: [i8x16; 2] = transmute(vld1q_s8_x2(a[1..].as_ptr())); + let e: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let mut r: [i8; 32] = [0i8; 32]; + vst1q_s8_x2(r.as_mut_ptr(), vld1q_s8_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s16_x2() { + unsafe fn test_vst1q_s16_x2() { let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let e: [i16x8; 2] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16)]; - let r: [i16x8; 2] = transmute(vld1q_s16_x2(a[1..].as_ptr())); + let e: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let mut r: [i16; 16] = [0i16; 16]; + vst1q_s16_x2(r.as_mut_ptr(), vld1q_s16_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s32_x2() { + unsafe fn test_vst1q_s32_x2() { let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; - let e: [i32x4; 2] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 
7, 8)]; - let r: [i32x4; 2] = transmute(vld1q_s32_x2(a[1..].as_ptr())); + let e: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let mut r: [i32; 8] = [0i32; 8]; + vst1q_s32_x2(r.as_mut_ptr(), vld1q_s32_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s64_x2() { + unsafe fn test_vst1q_s64_x2() { let a: [i64; 5] = [0, 1, 2, 3, 4]; - let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(3, 4)]; - let r: [i64x2; 2] = transmute(vld1q_s64_x2(a[1..].as_ptr())); + let e: [i64; 4] = [1, 2, 3, 4]; + let mut r: [i64; 4] = [0i64; 4]; + vst1q_s64_x2(r.as_mut_ptr(), vld1q_s64_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s8_x3() { + unsafe fn test_vst1_s8_x3() { let a: [i8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; - let e: [i8x8; 3] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24)]; - let r: [i8x8; 3] = transmute(vld1_s8_x3(a[1..].as_ptr())); + let e: [i8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; + let mut r: [i8; 24] = [0i8; 24]; + vst1_s8_x3(r.as_mut_ptr(), vld1_s8_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s16_x3() { + unsafe fn test_vst1_s16_x3() { let a: [i16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; - let e: [i16x4; 3] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12)]; - let r: [i16x4; 3] = transmute(vld1_s16_x3(a[1..].as_ptr())); + let e: [i16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + let mut r: [i16; 12] = [0i16; 12]; + vst1_s16_x3(r.as_mut_ptr(), vld1_s16_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s32_x3() { + unsafe fn test_vst1_s32_x3() { let a: [i32; 7] = [0, 1, 2, 3, 4, 5, 6]; - let e: [i32x2; 3] = [i32x2::new(1, 2), i32x2::new(3, 4), i32x2::new(5, 6)]; - let r: [i32x2; 3] = transmute(vld1_s32_x3(a[1..].as_ptr())); + let e: [i32; 6] = [1, 2, 3, 4, 5, 6]; + let mut r: [i32; 6] = [0i32; 6]; + vst1_s32_x3(r.as_mut_ptr(), vld1_s32_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s64_x3() { + unsafe fn test_vst1_s64_x3() { let a: [i64; 4] = [0, 1, 2, 3]; - let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(3)]; - let r: [i64x1; 3] = transmute(vld1_s64_x3(a[1..].as_ptr())); + let e: [i64; 3] = [1, 2, 3]; + let mut r: [i64; 3] = [0i64; 3]; + vst1_s64_x3(r.as_mut_ptr(), vld1_s64_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s8_x3() { + unsafe fn test_vst1q_s8_x3() { let a: [i8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let e: [i8x16; 3] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)]; - let r: [i8x16; 3] = transmute(vld1q_s8_x3(a[1..].as_ptr())); + let e: [i8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let mut r: [i8; 48] = [0i8; 48]; + vst1q_s8_x3(r.as_mut_ptr(), vld1q_s8_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - 
unsafe fn test_vld1q_s16_x3() { + unsafe fn test_vst1q_s16_x3() { let a: [i16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; - let e: [i16x8; 3] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24)]; - let r: [i16x8; 3] = transmute(vld1q_s16_x3(a[1..].as_ptr())); + let e: [i16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; + let mut r: [i16; 24] = [0i16; 24]; + vst1q_s16_x3(r.as_mut_ptr(), vld1q_s16_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s32_x3() { + unsafe fn test_vst1q_s32_x3() { let a: [i32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; - let e: [i32x4; 3] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 7, 8), i32x4::new(9, 10, 11, 12)]; - let r: [i32x4; 3] = transmute(vld1q_s32_x3(a[1..].as_ptr())); + let e: [i32; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + let mut r: [i32; 12] = [0i32; 12]; + vst1q_s32_x3(r.as_mut_ptr(), vld1q_s32_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s64_x3() { + unsafe fn test_vst1q_s64_x3() { let a: [i64; 7] = [0, 1, 2, 3, 4, 5, 6]; - let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6)]; - let r: [i64x2; 3] = transmute(vld1q_s64_x3(a[1..].as_ptr())); + let e: [i64; 6] = [1, 2, 3, 4, 5, 6]; + let mut r: [i64; 6] = [0i64; 6]; + vst1q_s64_x3(r.as_mut_ptr(), vld1q_s64_x3(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s8_x4() { + unsafe fn test_vst1_s8_x4() { let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; - let e: [i8x8; 4] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24), i8x8::new(25, 26, 27, 28, 29, 30, 31, 32)]; - let r: [i8x8; 4] = transmute(vld1_s8_x4(a[1..].as_ptr())); + let e: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let mut r: [i8; 32] = [0i8; 32]; + vst1_s8_x4(r.as_mut_ptr(), vld1_s8_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s16_x4() { + unsafe fn test_vst1_s16_x4() { let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let e: [i16x4; 4] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12), i16x4::new(13, 14, 15, 16)]; - let r: [i16x4; 4] = transmute(vld1_s16_x4(a[1..].as_ptr())); + let e: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let mut r: [i16; 16] = [0i16; 16]; + vst1_s16_x4(r.as_mut_ptr(), vld1_s16_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s32_x4() { + unsafe fn test_vst1_s32_x4() { let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; - let e: [i32x2; 4] = [i32x2::new(1, 2), i32x2::new(3, 4), i32x2::new(5, 6), i32x2::new(7, 8)]; - let r: [i32x2; 4] = transmute(vld1_s32_x4(a[1..].as_ptr())); + let e: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let mut r: [i32; 8] = [0i32; 8]; + vst1_s32_x4(r.as_mut_ptr(), vld1_s32_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_s64_x4() { + unsafe fn test_vst1_s64_x4() { let a: [i64; 5] = [0, 1, 2, 3, 4]; - let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(3), i64x1::new(4)]; - let r: 
[i64x1; 4] = transmute(vld1_s64_x4(a[1..].as_ptr())); + let e: [i64; 4] = [1, 2, 3, 4]; + let mut r: [i64; 4] = [0i64; 4]; + vst1_s64_x4(r.as_mut_ptr(), vld1_s64_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s8_x4() { + unsafe fn test_vst1q_s8_x4() { let a: [i8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; - let e: [i8x16; 4] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)]; - let r: [i8x16; 4] = transmute(vld1q_s8_x4(a[1..].as_ptr())); + let e: [i8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let mut r: [i8; 64] = [0i8; 64]; + vst1q_s8_x4(r.as_mut_ptr(), vld1q_s8_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s16_x4() { + unsafe fn test_vst1q_s16_x4() { let a: [i16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; - let e: [i16x8; 4] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24), i16x8::new(25, 26, 27, 28, 29, 30, 31, 32)]; - let r: [i16x8; 4] = transmute(vld1q_s16_x4(a[1..].as_ptr())); + let e: [i16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let mut r: [i16; 32] = [0i16; 32]; + vst1q_s16_x4(r.as_mut_ptr(), vld1q_s16_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s32_x4() { + unsafe fn test_vst1q_s32_x4() { let a: [i32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let e: [i32x4; 4] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 7, 8), i32x4::new(9, 10, 11, 12), i32x4::new(13, 14, 15, 16)]; - let r: [i32x4; 4] = transmute(vld1q_s32_x4(a[1..].as_ptr())); + let e: [i32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let mut r: [i32; 16] = [0i32; 16]; + vst1q_s32_x4(r.as_mut_ptr(), vld1q_s32_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s64_x4() { + unsafe fn test_vst1q_s64_x4() { let a: [i64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; - let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6), i64x2::new(7, 8)]; - let r: [i64x2; 4] = transmute(vld1q_s64_x4(a[1..].as_ptr())); + let e: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let mut r: [i64; 8] = [0i64; 8]; + vst1q_s64_x4(r.as_mut_ptr(), vld1q_s64_x4(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_u8_x2() { + unsafe fn test_vst1_u8_x2() { let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let e: [u8x8; 2] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16)]; - let r: [u8x8; 2] = transmute(vld1_u8_x2(a[1..].as_ptr())); + let e: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let mut r: [u8; 16] = [0u8; 16]; + 
vst1_u8_x2(r.as_mut_ptr(), vld1_u8_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_u16_x2() { + unsafe fn test_vst1_u16_x2() { let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; - let e: [u16x4; 2] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8)]; - let r: [u16x4; 2] = transmute(vld1_u16_x2(a[1..].as_ptr())); + let e: [u16; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let mut r: [u16; 8] = [0u16; 8]; + vst1_u16_x2(r.as_mut_ptr(), vld1_u16_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_u32_x2() { + unsafe fn test_vst1_u32_x2() { let a: [u32; 5] = [0, 1, 2, 3, 4]; - let e: [u32x2; 2] = [u32x2::new(1, 2), u32x2::new(3, 4)]; - let r: [u32x2; 2] = transmute(vld1_u32_x2(a[1..].as_ptr())); + let e: [u32; 4] = [1, 2, 3, 4]; + let mut r: [u32; 4] = [0u32; 4]; + vst1_u32_x2(r.as_mut_ptr(), vld1_u32_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_u64_x2() { + unsafe fn test_vst1_u64_x2() { let a: [u64; 3] = [0, 1, 2]; - let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)]; - let r: [u64x1; 2] = transmute(vld1_u64_x2(a[1..].as_ptr())); + let e: [u64; 2] = [1, 2]; + let mut r: [u64; 2] = [0u64; 2]; + vst1_u64_x2(r.as_mut_ptr(), vld1_u64_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_u8_x2() { + unsafe fn test_vst1q_u8_x2() { let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; - let e: [u8x16; 2] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)]; - let r: [u8x16; 2] = transmute(vld1q_u8_x2(a[1..].as_ptr())); + let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let mut r: [u8; 32] = [0u8; 32]; + vst1q_u8_x2(r.as_mut_ptr(), vld1q_u8_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_u16_x2() { + unsafe fn test_vst1q_u16_x2() { let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let e: [u16x8; 2] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16)]; - let r: [u16x8; 2] = transmute(vld1q_u16_x2(a[1..].as_ptr())); + let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let mut r: [u16; 16] = [0u16; 16]; + vst1q_u16_x2(r.as_mut_ptr(), vld1q_u16_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_u32_x2() { + unsafe fn test_vst1q_u32_x2() { let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; - let e: [u32x4; 2] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8)]; - let r: [u32x4; 2] = transmute(vld1q_u32_x2(a[1..].as_ptr())); + let e: [u32; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let mut r: [u32; 8] = [0u32; 8]; + vst1q_u32_x2(r.as_mut_ptr(), vld1q_u32_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_u64_x2() { + unsafe fn test_vst1q_u64_x2() { let a: [u64; 5] = [0, 1, 2, 3, 4]; - let e: [u64x2; 2] = [u64x2::new(1, 2), u64x2::new(3, 4)]; - let r: [u64x2; 2] = transmute(vld1q_u64_x2(a[1..].as_ptr())); + let e: [u64; 4] = [1, 2, 3, 4]; + let mut r: [u64; 4] = [0u64; 4]; + vst1q_u64_x2(r.as_mut_ptr(), vld1q_u64_x2(a[1..].as_ptr())); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vld1_u8_x3() { + unsafe fn test_vst1_u8_x3() { let a: [u8; 25] = [0, 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
-        let e: [u8x8; 3] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16), u8x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
-        let r: [u8x8; 3] = transmute(vld1_u8_x3(a[1..].as_ptr()));
+        let e: [u8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+        let mut r: [u8; 24] = [0u8; 24];
+        vst1_u8_x3(r.as_mut_ptr(), vld1_u8_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u16_x3() {
+    unsafe fn test_vst1_u16_x3() {
         let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
-        let e: [u16x4; 3] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8), u16x4::new(9, 10, 11, 12)];
-        let r: [u16x4; 3] = transmute(vld1_u16_x3(a[1..].as_ptr()));
+        let e: [u16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let mut r: [u16; 12] = [0u16; 12];
+        vst1_u16_x3(r.as_mut_ptr(), vld1_u16_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u32_x3() {
+    unsafe fn test_vst1_u32_x3() {
         let a: [u32; 7] = [0, 1, 2, 3, 4, 5, 6];
-        let e: [u32x2; 3] = [u32x2::new(1, 2), u32x2::new(3, 4), u32x2::new(5, 6)];
-        let r: [u32x2; 3] = transmute(vld1_u32_x3(a[1..].as_ptr()));
+        let e: [u32; 6] = [1, 2, 3, 4, 5, 6];
+        let mut r: [u32; 6] = [0u32; 6];
+        vst1_u32_x3(r.as_mut_ptr(), vld1_u32_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u64_x3() {
+    unsafe fn test_vst1_u64_x3() {
         let a: [u64; 4] = [0, 1, 2, 3];
-        let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(3)];
-        let r: [u64x1; 3] = transmute(vld1_u64_x3(a[1..].as_ptr()));
+        let e: [u64; 3] = [1, 2, 3];
+        let mut r: [u64; 3] = [0u64; 3];
+        vst1_u64_x3(r.as_mut_ptr(), vld1_u64_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u8_x3() {
+    unsafe fn test_vst1q_u8_x3() {
         let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-        let e: [u8x16; 3] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)];
-        let r: [u8x16; 3] = transmute(vld1q_u8_x3(a[1..].as_ptr()));
+        let e: [u8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let mut r: [u8; 48] = [0u8; 48];
+        vst1q_u8_x3(r.as_mut_ptr(), vld1q_u8_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u16_x3() {
+    unsafe fn test_vst1q_u16_x3() {
         let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
-        let e: [u16x8; 3] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16), u16x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
-        let r: [u16x8; 3] = transmute(vld1q_u16_x3(a[1..].as_ptr()));
+        let e: [u16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+        let mut r: [u16; 24] = [0u16; 24];
+        vst1q_u16_x3(r.as_mut_ptr(), vld1q_u16_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u32_x3() {
+    unsafe fn test_vst1q_u32_x3() {
         let a: [u32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
-        let e: [u32x4; 3] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8), u32x4::new(9, 10, 11, 12)];
-        let r: [u32x4; 3] = transmute(vld1q_u32_x3(a[1..].as_ptr()));
+        let e: [u32; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let mut r: [u32; 12] = [0u32; 12];
+        vst1q_u32_x3(r.as_mut_ptr(), vld1q_u32_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u64_x3() {
+    unsafe fn test_vst1q_u64_x3() {
         let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6];
-        let e: [u64x2; 3] = [u64x2::new(1, 2), u64x2::new(3, 4), u64x2::new(5, 6)];
-        let r: [u64x2; 3] = transmute(vld1q_u64_x3(a[1..].as_ptr()));
+        let e: [u64; 6] = [1, 2, 3, 4, 5, 6];
+        let mut r: [u64; 6] = [0u64; 6];
+        vst1q_u64_x3(r.as_mut_ptr(), vld1q_u64_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u8_x4() {
+    unsafe fn test_vst1_u8_x4() {
         let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
-        let e: [u8x8; 4] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16), u8x8::new(17, 18, 19, 20, 21, 22, 23, 24), u8x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
-        let r: [u8x8; 4] = transmute(vld1_u8_x4(a[1..].as_ptr()));
+        let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst1_u8_x4(r.as_mut_ptr(), vld1_u8_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u16_x4() {
+    unsafe fn test_vst1_u16_x4() {
         let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-        let e: [u16x4; 4] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8), u16x4::new(9, 10, 11, 12), u16x4::new(13, 14, 15, 16)];
-        let r: [u16x4; 4] = transmute(vld1_u16_x4(a[1..].as_ptr()));
+        let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst1_u16_x4(r.as_mut_ptr(), vld1_u16_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u32_x4() {
+    unsafe fn test_vst1_u32_x4() {
         let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
-        let e: [u32x2; 4] = [u32x2::new(1, 2), u32x2::new(3, 4), u32x2::new(5, 6), u32x2::new(7, 8)];
-        let r: [u32x2; 4] = transmute(vld1_u32_x4(a[1..].as_ptr()));
+        let e: [u32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+        let mut r: [u32; 8] = [0u32; 8];
+        vst1_u32_x4(r.as_mut_ptr(), vld1_u32_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u64_x4() {
+    unsafe fn test_vst1_u64_x4() {
         let a: [u64; 5] = [0, 1, 2, 3, 4];
-        let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(3), u64x1::new(4)];
-        let r: [u64x1; 4] = transmute(vld1_u64_x4(a[1..].as_ptr()));
+        let e: [u64; 4] = [1, 2, 3, 4];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst1_u64_x4(r.as_mut_ptr(), vld1_u64_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u8_x4() {
+    unsafe fn test_vst1q_u8_x4() {
         let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
-        let e: [u8x16; 4] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
-        let r: [u8x16; 4] = transmute(vld1q_u8_x4(a[1..].as_ptr()));
+        let e: [u8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+        let mut r: [u8; 64] = [0u8; 64];
+        vst1q_u8_x4(r.as_mut_ptr(), vld1q_u8_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u16_x4() {
+    unsafe fn test_vst1q_u16_x4() {
         let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
-        let e: [u16x8; 4] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16), u16x8::new(17, 18, 19, 20, 21, 22, 23, 24), u16x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
-        let r: [u16x8; 4] = transmute(vld1q_u16_x4(a[1..].as_ptr()));
+        let e: [u16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+        let mut r: [u16; 32] = [0u16; 32];
+        vst1q_u16_x4(r.as_mut_ptr(), vld1q_u16_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u32_x4() {
+    unsafe fn test_vst1q_u32_x4() {
         let a: [u32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-        let e: [u32x4; 4] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8), u32x4::new(9, 10, 11, 12), u32x4::new(13, 14, 15, 16)];
-        let r: [u32x4; 4] = transmute(vld1q_u32_x4(a[1..].as_ptr()));
+        let e: [u32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let mut r: [u32; 16] = [0u32; 16];
+        vst1q_u32_x4(r.as_mut_ptr(), vld1q_u32_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u64_x4() {
+    unsafe fn test_vst1q_u64_x4() {
         let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
-        let e: [u64x2; 4] = [u64x2::new(1, 2), u64x2::new(3, 4), u64x2::new(5, 6), u64x2::new(7, 8)];
-        let r: [u64x2; 4] = transmute(vld1q_u64_x4(a[1..].as_ptr()));
+        let e: [u64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+        let mut r: [u64; 8] = [0u64; 8];
+        vst1q_u64_x4(r.as_mut_ptr(), vld1q_u64_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p8_x2() {
+    unsafe fn test_vst1_p8_x2() {
         let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-        let e: [i8x8; 2] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
-        let r: [i8x8; 2] = transmute(vld1_p8_x2(a[1..].as_ptr()));
+        let e: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let mut r: [u8; 16] = [0u8; 16];
+        vst1_p8_x2(r.as_mut_ptr(), vld1_p8_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p8_x3() {
+    unsafe fn test_vst1_p8_x3() {
         let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
-        let e: [i8x8; 3] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
-        let r: [i8x8; 3] = transmute(vld1_p8_x3(a[1..].as_ptr()));
+        let e: [u8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+        let mut r: [u8; 24] = [0u8; 24];
+        vst1_p8_x3(r.as_mut_ptr(), vld1_p8_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p8_x4() {
+    unsafe fn test_vst1_p8_x4() {
         let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
-        let e: [i8x8; 4] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24), i8x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
-        let r: [i8x8; 4] = transmute(vld1_p8_x4(a[1..].as_ptr()));
+        let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst1_p8_x4(r.as_mut_ptr(), vld1_p8_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p8_x2() {
+    unsafe fn test_vst1q_p8_x2() {
         let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
-        let e: [i8x16; 2] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
-        let r: [i8x16; 2] = transmute(vld1q_p8_x2(a[1..].as_ptr()));
+        let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst1q_p8_x2(r.as_mut_ptr(), vld1q_p8_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p8_x3() {
+    unsafe fn test_vst1q_p8_x3() {
         let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-        let e: [i8x16; 3] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)];
-        let r: [i8x16; 3] = transmute(vld1q_p8_x3(a[1..].as_ptr()));
+        let e: [u8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let mut r: [u8; 48] = [0u8; 48];
+        vst1q_p8_x3(r.as_mut_ptr(), vld1q_p8_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p8_x4() {
+    unsafe fn test_vst1q_p8_x4() {
         let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
-        let e: [i8x16; 4] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
-        let r: [i8x16; 4] = transmute(vld1q_p8_x4(a[1..].as_ptr()));
+        let e: [u8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+        let mut r: [u8; 64] = [0u8; 64];
+        vst1q_p8_x4(r.as_mut_ptr(), vld1q_p8_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p16_x2() {
+    unsafe fn test_vst1_p16_x2() {
         let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
-        let e: [i16x4; 2] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8)];
-        let r: [i16x4; 2] = transmute(vld1_p16_x2(a[1..].as_ptr()));
+        let e: [u16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+        let mut r: [u16; 8] = [0u16; 8];
+        vst1_p16_x2(r.as_mut_ptr(), vld1_p16_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p16_x3() {
+    unsafe fn test_vst1_p16_x3() {
         let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
-        let e: [i16x4; 3] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12)];
-        let r: [i16x4; 3] = transmute(vld1_p16_x3(a[1..].as_ptr()));
+        let e: [u16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let mut r: [u16; 12] = [0u16; 12];
+        vst1_p16_x3(r.as_mut_ptr(), vld1_p16_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p16_x4() {
+    unsafe fn test_vst1_p16_x4() {
         let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-        let e: [i16x4; 4] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12), i16x4::new(13, 14, 15, 16)];
-        let r: [i16x4; 4] = transmute(vld1_p16_x4(a[1..].as_ptr()));
+        let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst1_p16_x4(r.as_mut_ptr(), vld1_p16_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p16_x2() {
+    unsafe fn test_vst1q_p16_x2() {
         let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-        let e: [i16x8; 2] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
-        let r: [i16x8; 2] = transmute(vld1q_p16_x2(a[1..].as_ptr()));
+        let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst1q_p16_x2(r.as_mut_ptr(), vld1q_p16_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p16_x3() {
+    unsafe fn test_vst1q_p16_x3() {
         let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
-        let e: [i16x8; 3] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
-        let r: [i16x8; 3] = transmute(vld1q_p16_x3(a[1..].as_ptr()));
+        let e: [u16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+        let mut r: [u16; 24] = [0u16; 24];
+        vst1q_p16_x3(r.as_mut_ptr(), vld1q_p16_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p16_x4() {
+    unsafe fn test_vst1q_p16_x4() {
         let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
-        let e: [i16x8; 4] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24), i16x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
-        let r: [i16x8; 4] = transmute(vld1q_p16_x4(a[1..].as_ptr()));
+        let e: [u16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+        let mut r: [u16; 32] = [0u16; 32];
+        vst1q_p16_x4(r.as_mut_ptr(), vld1q_p16_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_f32_x2() {
+    unsafe fn test_vst1_f32_x2() {
         let a: [f32; 5] = [0., 1., 2., 3., 4.];
-        let e: [f32x2; 2] = [f32x2::new(1., 2.), f32x2::new(3., 4.)];
-        let r: [f32x2; 2] = transmute(vld1_f32_x2(a[1..].as_ptr()));
+        let e: [f32; 4] = [1., 2., 3., 4.];
+        let mut r: [f32; 4] = [0f32; 4];
+        vst1_f32_x2(r.as_mut_ptr(), vld1_f32_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_f32_x2() {
+    unsafe fn test_vst1q_f32_x2() {
         let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
-        let e: [f32x4; 2] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.)];
-        let r: [f32x4; 2] = transmute(vld1q_f32_x2(a[1..].as_ptr()));
+        let e: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+        let mut r: [f32; 8] = [0f32; 8];
+        vst1q_f32_x2(r.as_mut_ptr(), vld1q_f32_x2(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_f32_x3() {
+    unsafe fn test_vst1_f32_x3() {
         let a: [f32; 7] = [0., 1., 2., 3., 4., 5., 6.];
-        let e: [f32x2; 3] = [f32x2::new(1., 2.), f32x2::new(3., 4.), f32x2::new(5., 6.)];
-        let r: [f32x2; 3] = transmute(vld1_f32_x3(a[1..].as_ptr()));
+        let e: [f32; 6] = [1., 2., 3., 4., 5., 6.];
+        let mut r: [f32; 6] = [0f32; 6];
+        vst1_f32_x3(r.as_mut_ptr(), vld1_f32_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_f32_x3() {
+    unsafe fn test_vst1q_f32_x3() {
         let a: [f32; 13] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.];
-        let e: [f32x4; 3] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.), f32x4::new(9., 10., 11., 12.)];
-        let r: [f32x4; 3] = transmute(vld1q_f32_x3(a[1..].as_ptr()));
+        let e: [f32; 12] = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.];
+        let mut r: [f32; 12] = [0f32; 12];
+        vst1q_f32_x3(r.as_mut_ptr(), vld1q_f32_x3(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_f32_x4() {
+    unsafe fn test_vst1_f32_x4() {
         let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
-        let e: [f32x2; 4] = [f32x2::new(1., 2.), f32x2::new(3., 4.), f32x2::new(5., 6.), f32x2::new(7., 8.)];
-        let r: [f32x2; 4] = transmute(vld1_f32_x4(a[1..].as_ptr()));
+        let e: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+        let mut r: [f32; 8] = [0f32; 8];
+        vst1_f32_x4(r.as_mut_ptr(), vld1_f32_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_f32_x4() {
+    unsafe fn test_vst1q_f32_x4() {
         let a: [f32; 17] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.];
-        let e: [f32x4; 4] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.), f32x4::new(9., 10., 11., 12.), f32x4::new(13., 14., 15., 16.)];
-        let r: [f32x4; 4] = transmute(vld1q_f32_x4(a[1..].as_ptr()));
+        let e: [f32; 16] = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.];
+        let mut r: [f32; 16] = [0f32; 16];
+        vst1q_f32_x4(r.as_mut_ptr(), vld1q_f32_x4(a[1..].as_ptr()));
         assert_eq!(r, e);
     }
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index 2e2e3cee40..789a394885 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -955,6 +955,7 @@ multi_fn = static_assert-N-1-bits
 a = 1, 2, 3, 4
 n = 2
 validate 0.25, 0.5, 0.75, 1.
+arm-aarch64-separate
 
 aarch64 = scvtf
 link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
@@ -971,6 +972,7 @@ link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
 arm = vcvt
 link-arm = vcvtfxs2fp._EXT2_._EXT_
 const-arm = N:i32
+
 generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
 
 aarch64 = ucvtf
@@ -988,6 +990,7 @@ multi_fn = static_assert-N-1-bits
 a = 0.25, 0.5, 0.75, 1.
 n = 2
 validate 1, 2, 3, 4
+arm-aarch64-separate
 
 aarch64 = fcvtzs
 link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
@@ -2038,7 +2041,7 @@ name = vld1
 out-suffix
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
-test = load_test
+load_fn
 
 aarch64 = ld1
 link-aarch64 = ld1x2._EXT2_
@@ -2064,7 +2067,7 @@ multi_fn = transmute, {vld1-outsigned-noext, transmute(a)}
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
-test = load_test
+load_fn
 
 aarch64 = ld1
 arm = vld1
 generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
@@ -2083,7 +2086,7 @@ name = vld1
 out-suffix
 a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
 validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
-test = load_test
+load_fn
 
 aarch64 = ld1
 link-aarch64 = ld1x2._EXT2_
@@ -2108,6 +2111,80 @@ link-aarch64 = ld1x4._EXT2_
 link-arm = vld1x4._EXT2_
 generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
 
+/// Store multiple single-element structures from one, two, three, or four registers
+name = vst1
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+store_fn
+arm-aarch64-separate
+
+aarch64 = st1
+link-aarch64 = st1x2._EXT3_
+arm = vst1
+link-arm = vst1x2._EXT3_
+generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void
+generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void
+
+link-aarch64 = st1x3._EXT3_
+link-arm = vst1x3._EXT3_
+generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void
+generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void
+
+link-aarch64 = st1x4._EXT3_
+link-arm = vst1x4._EXT3_
+generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void
+generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void
+
+/// Store multiple single-element structures to one, two, three, or four registers
+name = vst1
+multi_fn = vst1-signed-noext, transmute(a), transmute(b)
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+
+store_fn
+aarch64 = st1
+arm = vst1
+generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void
+generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void, *mut u64:uint64x2x2_t:void
+generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void
+generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void, *mut u64:uint64x2x3_t:void
+generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void
+generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void, *mut u64:uint64x2x4_t:void
+generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t:void
+generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void
+generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void
+generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, *mut p16:poly16x8x4_t:void
+
+/// Store multiple single-element structures to one, two, three, or four registers
+name = vst1
+a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
+validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st1
+link-aarch64 = st1x2._EXT3_
+generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
+
+link-aarch64 = st1x3._EXT3_
+generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
+
+link-aarch64 = st1x4._EXT3_
+generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
+
+arm = vst1
+link-aarch64 = st1x2._EXT3_
+link-arm = vst1x2._EXT3_
+generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
+
+link-aarch64 = st1x3._EXT3_
+link-arm = vst1x3._EXT3_
+generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
+
+link-aarch64 = st1x4._EXT3_
+link-arm = vst1x4._EXT3_
+generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
+
 /// Multiply
 name = vmul
 a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@@ -3869,6 +3946,7 @@ const-aarch64 = N
 arm = vqrshrn
 link-arm = vqrshiftns._EXT2_
 const-arm = -N as ttn
+arm-aarch64-separate
 generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
 
 /// Signed saturating rounded shift right narrow
@@ -3915,6 +3993,7 @@ const-aarch64 = N
 arm = vqrshrn
 link-arm = vqrshiftnu._EXT2_
 const-arm = -N as ttn
+arm-aarch64-separate
 generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
 
 /// Unsigned saturating rounded shift right narrow
@@ -3961,6 +4040,7 @@ const-aarch64 = N
 arm = vqrshrun
 link-arm = vqrshiftnsu._EXT2_
 const-arm = -N as ttn
+arm-aarch64-separate
 generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t
 
 /// Signed saturating rounded shift right unsigned narrow
@@ -4106,6 +4186,7 @@ multi_fn = static_assert-N-1-halfbits
 a = 0, 4, 8, 12, 16, 20, 24, 28
 n = 2
 validate 0, 1, 2, 3, 4, 5, 6, 7
+arm-aarch64-separate
 
 aarch64 = sqshrn
 link-aarch64 = sqshrn._EXT2_
@@ -4152,6 +4233,7 @@ multi_fn = static_assert-N-1-halfbits
 a = 0, 4, 8, 12, 16, 20, 24, 28
 n = 2
 validate 0, 1, 2, 3, 4, 5, 6, 7
+arm-aarch64-separate
 
 aarch64 = uqshrn
 link-aarch64 = uqshrn._EXT2_
@@ -4198,6 +4280,7 @@ multi_fn = static_assert-N-1-halfbits
 a = 0, 4, 8, 12, 16, 20, 24, 28
 n = 2
 validate 0, 1, 2, 3, 4, 5, 6, 7
+arm-aarch64-separate
 
 aarch64 = sqshrun
 link-aarch64 = sqshrun._EXT2_
@@ -4542,6 +4625,7 @@ multi_fn = static_assert-N-1-halfbits
 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 n = 2
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+arm-aarch64-separate
 
 aarch64 = rshrn
 link-aarch64 = rshrn._EXT2_
diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs
index 493eb05a24..82149064d2 100644
--- a/crates/stdarch-gen/src/main.rs
+++ b/crates/stdarch-gen/src/main.rs
@@ -448,6 +448,13 @@ enum TargetFeature {
     AES,
 }
 
+#[derive(Clone, Copy)]
+enum Fntype {
+    Normal,
+    Load,
+    Store,
+}
+
 fn type_to_global_type(t: &str) -> &str {
     match t {
         "int8x8_t" | "int8x8x2_t" | "int8x8x3_t" | "int8x8x4_t" => "i8x8",
@@ -496,35 +503,30 @@ fn type_to_global_type(t: &str) -> &str {
     }
 }
 
-fn type_to_native_type(t: &str) -> &str {
-    match t {
-        "int8x8_t" | "int8x16_t" | "i8" | "int8x8x2_t" | "int8x8x3_t" | "int8x8x4_t"
-        | "int8x16x2_t" | "int8x16x3_t" | "int8x16x4_t" => "i8",
-        "int16x4_t" | "int16x8_t" | "i16" | "int16x4x2_t" | "int16x4x3_t" | "int16x4x4_t"
-        | "int16x8x2_t" | "int16x8x3_t" | "int16x8x4_t" => "i16",
-        "int32x2_t" | "int32x4_t" | "i32" | "int32x2x2_t" | "int32x2x3_t" | "int32x2x4_t"
-        | "int32x4x2_t" | "int32x4x3_t" | "int32x4x4_t" => "i32",
-        "int64x1_t" | "int64x2_t" | "i64" | "int64x1x2_t" | "int64x1x3_t" | "int64x1x4_t"
-        | "int64x2x2_t" | "int64x2x3_t" | "int64x2x4_t" => "i64",
-        "uint8x8_t" | "uint8x16_t" | "u8" | "uint8x8x2_t" | "uint8x8x3_t" | "uint8x8x4_t"
-        | "uint8x16x2_t" | "uint8x16x3_t" | "uint8x16x4_t" => "u8",
-        "uint16x4_t" | "uint16x8_t" | "u16" | "uint16x4x2_t" | "uint16x4x3_t" | "uint16x4x4_t"
-        | "uint16x8x2_t" | "uint16x8x3_t" | "uint16x8x4_t" => "u16",
-        "uint32x2_t" | "uint32x4_t" | "u32" | "uint32x2x2_t" | "uint32x2x3_t" | "uint32x2x4_t"
-        | "uint32x4x2_t" | "uint32x4x3_t" | "uint32x4x4_t" => "u32",
-        "uint64x1_t" | "uint64x2_t" | "u64" | "uint64x1x2_t" | "uint64x1x3_t" | "uint64x1x4_t"
-        | "uint64x2x2_t" | "uint64x2x3_t" | "uint64x2x4_t" => "u64",
-        "float16x4_t" | "float16x8_t" => "f16",
-        "float32x2_t" | "float32x4_t" | "float32x2x2_t" | "float32x2x3_t" | "float32x2x4_t"
-        | "float32x4x2_t" | "float32x4x3_t" | "float32x4x4_t" => "f32",
-        "float64x1_t" | "float64x2_t" | "float64x1x2_t" | "float64x1x3_t" | "float64x1x4_t"
-        | "float64x2x2_t" | "float64x2x3_t" | "float64x2x4_t" => "f64",
-        "poly8x8_t" | "poly8x16_t" | "poly8x8x2_t" | "poly8x8x3_t" | "poly8x8x4_t"
-        | "poly8x16x2_t" | "poly8x16x3_t" | "poly8x16x4_t" => "u8",
-        "poly16x4_t" | "poly16x8_t" | "poly16x4x2_t" | "poly16x4x3_t" | "poly16x4x4_t"
-        | "poly16x8x2_t" | "poly16x8x3_t" | "poly16x8x4_t" => "u16",
-        "poly64x1_t" | "poly64x2_t" | "poly64x1x2_t" | "poly64x1x3_t" | "poly64x1x4_t"
-        | "poly64x2x2_t" | "poly64x2x3_t" | "poly64x2x4_t" => "u64",
+fn type_to_sub_type(t: &str) -> String {
+    let s: Vec<_> = t.split('x').collect();
+    match s.len() {
+        2 => String::from(t),
+        3 => format!("{}x{}_t", s[0], s[1]),
+        _ => panic!("unknown type: {}", t),
+    }
+}
+
+fn type_to_native_type(t: &str) -> String {
+    let s: Vec<_> = t.split('x').collect();
+    match s.len() {
+        1 => {
+            assert!(t.contains("*const") || t.contains("*mut"));
+            let sub: Vec<_> = t.split(' ').collect();
+            String::from(sub[1])
+        }
+        2 | 3 => match &s[0][0..3] {
+            "int" => format!("i{}", &s[0][3..]),
+            "uin" => format!("u{}", &s[0][4..]),
+            "flo" => format!("f{}", &s[0][5..]),
+            "pol" => format!("u{}", &s[0][4..]),
+            _ => panic!("unknown type: {}", t),
+        },
         _ => panic!("unknown type: {}", t),
     }
 }
@@ -563,82 +565,26 @@ fn native_type_to_long_type(t: &str) -> &str {
     }
 }
 
-fn type_to_ext(t: &str) -> &str {
-    match t {
-        "int8x8_t" => "v8i8",
-        "int8x16_t" => "v16i8",
-        "int16x4_t" => "v4i16",
-        "int16x8_t" => "v8i16",
-        "int32x2_t" => "v2i32",
-        "int32x4_t" => "v4i32",
-        "int64x1_t" => "v1i64",
-        "int64x2_t" => "v2i64",
-        "uint8x8_t" => "v8i8",
-        "uint8x16_t" => "v16i8",
-        "uint16x4_t" => "v4i16",
-        "uint16x8_t" => "v8i16",
-        "uint32x2_t" => "v2i32",
-        "uint32x4_t" => "v4i32",
-        "uint64x1_t" => "v1i64",
-        "uint64x2_t" => "v2i64",
-        "float16x4_t" => "v4f16",
-        "float16x8_t" => "v8f16",
-        "float32x2_t" => "v2f32",
-        "float32x4_t" => "v4f32",
-        "float64x1_t" => "v1f64",
-        "float64x2_t" => "v2f64",
-        "poly8x8_t" => "v8i8",
-        "poly8x16_t" => "v16i8",
-        "poly16x4_t" => "v4i16",
-        "poly16x8_t" => "v8i16",
-        "int8x8x2_t" | "int8x8x3_t" | "int8x8x4_t" => "v8i8.p0i8",
-        "int16x4x2_t" | "int16x4x3_t" | "int16x4x4_t" => "v4i16.p0i16",
-        "int32x2x2_t" | "int32x2x3_t" | "int32x2x4_t" => "v2i32.p0i32",
-        "int64x1x2_t" | "int64x1x3_t" | "int64x1x4_t" => "v1i64.p0i64",
-        "uint8x8x2_t" | "uint8x8x3_t" | "uint8x8x4_t" => "v8i8.p0i8",
-        "uint16x4x2_t" | "uint16x4x3_t" | "uint16x4x4_t" => "v4i16.p0i16",
-        "uint32x2x2_t" | "uint32x2x3_t" | "uint32x2x4_t" => "v2i32.p0i32",
-        "uint64x1x2_t" | "uint64x1x3_t" | "uint64x1x4_t" => "v1i64.p0i64",
-        "float32x2x2_t" | "float32x2x3_t" | "float32x2x4_t" => "v2f32.p0f32",
-        "float64x1x2_t" | "float64x1x3_t" | "float64x1x4_t" => "v1f64.p0f64",
-        "int8x16x2_t" | "int8x16x3_t" | "int8x16x4_t" => "v16i8.p0i8",
-        "int16x8x2_t" | "int16x8x3_t" | "int16x8x4_t" => "v8i16.p0i16",
-        "int32x4x2_t" | "int32x4x3_t" | "int32x4x4_t" => "v4i32.p0i32",
-        "int64x2x2_t" | "int64x2x3_t" | "int64x2x4_t" => "v2i64.p0i64",
-        "uint8x16x2_t" | "uint8x16x3_t" | "uint8x16x4_t" => "v16i8.p0i8",
-        "uint16x8x2_t" | "uint16x8x3_t" | "uint16x8x4_t" => "v8i16.p0i16",
-        "uint32x4x2_t" | "uint32x4x3_t" | "uint32x4x4_t" => "v4i32.p0i32",
-        "uint64x2x2_t" | "uint64x2x3_t" | "uint64x2x4_t" => "v2i64.p0i64",
-        "float32x4x2_t" | "float32x4x3_t" | "float32x4x4_t" => "v4f32.p0f32",
-        "float64x2x2_t" | "float64x2x3_t" | "float64x2x4_t" => "v2f64.p0f64",
-        "i8" => "i8",
-        "i16" => "i16",
-        "i32" => "i32",
-        "i64" => "i64",
-        "u8" => "i8",
-        "u16" => "i16",
-        "u32" => "i32",
-        "u64" => "i64",
-        "f32" => "f32",
-        "f64" => "f64",
-        "p64" => "p64",
-        "p128" => "p128",
-        "*const i8" => "i8",
-        "*const i16" => "i16",
-        "*const i32" => "i32",
-        "*const i64" => "i64",
-        "*const u8" => "i8",
-        "*const u16" => "i16",
-        "*const u32" => "i32",
-        "*const u64" => "i64",
-        "*const f32" => "f32",
-        "*const f64" => "f64",
-        /*
-        "poly64x1_t" => "i64x1",
-        "poly64x2_t" => "i64x2",
-        */
-        _ => panic!("unknown type for extension: {}", t),
+fn type_to_ext(t: &str) -> String {
+    if !t.contains('x') {
+        return t.replace("u", "i");
     }
+    let native = type_to_native_type(t);
+    let sub_ext = match type_sub_len(t) {
+        1 => String::new(),
+        _ => format!(".p0{}", native),
+    };
+    let sub_type = match &native[0..1] {
+        "i" | "f" => native,
+        "u" => native.replace("u", "i"),
+        _ => panic!("unknown type: {}", t),
+    };
+    format!(
+        "v{}{}{}",
+        &type_len(&type_to_sub_type(t)).to_string(),
+        sub_type,
+        sub_ext
+    )
 }
 
 fn type_to_half(t: &str) -> &str {
@@ -969,7 +915,7 @@ fn gen_aarch64(
     target: TargetFeature,
     fixed: &Vec<String>,
     multi_fn: &Vec<String>,
-    test_fn: &str,
+    fn_type: Fntype,
 ) -> (String, String) {
     let name = match suffix {
         Normal => format!("{}{}", current_name, type_to_suffix(in_t[1])),
@@ -1022,6 +968,7 @@ fn gen_aarch64(
     let current_aarch64 = current_aarch64.clone().unwrap();
     let mut ext_c = String::new();
     let mut ext_c_const = String::new();
+    let mut ext_c_store = String::new();
     let mut link_t: Vec<String> = vec![
         in_t[0].to_string(),
         in_t[1].to_string(),
@@ -1042,12 +989,18 @@ fn gen_aarch64(
     }
     let ext = type_to_ext(in_t[0]);
     let ext2 = type_to_ext(out_t);
+    let ext3 = type_to_ext(in_t[1]);
     let link_aarch64 = if link_aarch64.starts_with("llvm") {
-        link_aarch64.replace("_EXT_", ext).replace("_EXT2_", ext2)
+        link_aarch64
+            .replace("_EXT_", &ext)
+            .replace("_EXT2_", &ext2)
+            .replace("_EXT3_", &ext3)
     } else {
         let mut link = String::from("llvm.aarch64.neon.");
        link.push_str(&link_aarch64);
-        link.replace("_EXT_", ext).replace("_EXT2_", ext2)
+        link.replace("_EXT_", &ext)
+            .replace("_EXT2_", &ext2)
+            .replace("_EXT3_", &ext3)
    };
     ext_c = format!(
         r#"#[allow(improper_ctypes)]
@@ -1097,6 +1050,38 @@ fn gen_aarch64(
             out_t
         );
     }
+    if matches!(fn_type, Fntype::Store) {
+        let sub = type_to_sub_type(in_t[1]);
+        let native = type_to_native_type(in_t[1]);
+        ext_c_store = format!(
+            r#"#[allow(improper_ctypes)]
+    extern "unadjusted" {{
+        #[cfg_attr(target_arch = "aarch64", link_name = "{}")]
+        fn {}({});
+    }}
+    "#,
+            link_aarch64,
+            current_fn,
+            match type_sub_len(in_t[1]) {
+                1 => {
+                    format!("a: {}, ptr: *mut {}", sub, native,)
+                }
+                2 => {
+                    format!("a: {}, b: {}, ptr: *mut {}", sub, sub, native,)
+                }
+                3 => {
+                    format!("a: {}, b: {}, c: {}, ptr: *mut {}", sub, sub, sub, native,)
+                }
+                4 => {
+                    format!(
+ "a: {}, b: {}, c: {}, d: {}, ptr: *mut {}", + sub, sub, sub, sub, native, + ) + } + _ => panic!("unsupported type: {}", in_t[1]), + }, + ); + } }; let const_declare = if let Some(constn) = constn { if constn.contains(":") { @@ -1166,10 +1151,22 @@ fn gen_aarch64( } else { String::new() }; - let trans: [&str; 2] = if link_t[3] != out_t { - ["transmute(", ")"] - } else { - ["", ""] + let fn_decl = { + let fn_output = if out_t == "void" { + String::new() + } else { + format!("-> {} ", out_t) + }; + let fn_inputs = match para_num { + 1 => format!("(a: {})", in_t[0]), + 2 => format!("(a: {}, b: {})", in_t[0], in_t[1]), + 3 => format!("(a: {}, b: {}, c: {})", in_t[0], in_t[1], in_t[2]), + _ => panic!("unsupported parameter number"), + }; + format!( + "pub unsafe fn {}{}{} {}", + name, const_declare, fn_inputs, fn_output + ) }; let call = if let Some(const_aarch64) = const_aarch64 { match para_num { @@ -1202,25 +1199,55 @@ fn gen_aarch64( ), _ => String::new(), } + } else if matches!(fn_type, Fntype::Store) { + match type_sub_len(in_t[1]) { + 1 => format!( + r#"{}{{ + {}{}(b, a) +}}"#, + fn_decl, ext_c_store, current_fn, + ), + 2 => format!( + r#"{}{{ + {}{}(b.0, b.1, a) +}}"#, + fn_decl, ext_c_store, current_fn, + ), + 3 => format!( + r#"{}{{ + {}{}(b.0, b.1, b.2, a) +}}"#, + fn_decl, ext_c_store, current_fn, + ), + 4 => format!( + r#"{}{{ + {}{}(b.0, b.1, b.2, b.3, a) +}}"#, + fn_decl, ext_c_store, current_fn, + ), + _ => panic!("unsupported type: {}", in_t[1]), + } } else { + let trans: [&str; 2] = if link_t[3] != out_t { + ["transmute(", ")"] + } else { + ["", ""] + }; match (multi_calls.len(), para_num, fixed.len()) { (0, 1, 0) => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ + r#"{}{{ {}{}{}(a){} }}"#, - name, const_declare, in_t[0], out_t, ext_c, trans[0], current_fn, trans[1] + fn_decl, ext_c, trans[0], current_fn, trans[1] ), (0, 1, _) => { let fixed: Vec = fixed.iter().take(type_len(in_t[0])).cloned().collect(); format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ + r#"{}{{ let b{}; {}{}{}(a, transmute(b)){} }}"#, - name, - const_declare, - in_t[0], - out_t, + fn_decl, values(in_t[0], &fixed), ext_c, trans[0], @@ -1229,34 +1256,34 @@ fn gen_aarch64( ) } (0, 2, _) => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{ + r#"{}{{ {}{}{}(a, b){} }}"#, - name, const_declare, in_t[0], in_t[1], out_t, ext_c, trans[0], current_fn, trans[1], + fn_decl, ext_c, trans[0], current_fn, trans[1], ), (0, 3, _) => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}, c: {}) -> {} {{ + r#"{}{{ {}{}(a, b, c) }}"#, - name, const_declare, in_t[0], in_t[1], in_t[2], out_t, ext_c, current_fn, + fn_decl, ext_c, current_fn, ), (_, 1, _) => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ + r#"{}{{ {}{} }}"#, - name, const_declare, in_t[0], out_t, ext_c, multi_calls, + fn_decl, ext_c, multi_calls, ), (_, 2, _) => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{ + r#"{}{{ {}{} }}"#, - name, const_declare, in_t[0], in_t[1], out_t, ext_c, multi_calls, + fn_decl, ext_c, multi_calls, ), (_, 3, _) => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}, c: {}) -> {} {{ + r#"{}{{ {}{} }}"#, - name, const_declare, in_t[0], in_t[1], in_t[2], out_t, ext_c, multi_calls, + fn_decl, ext_c, multi_calls, ), (_, _, _) => String::new(), } @@ -1271,11 +1298,8 @@ fn gen_aarch64( "#, current_comment, current_target, current_aarch64, const_assert, const_legacy, call ); - - let test = if test_fn == "load_test" { - gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)) - } else { - gen_test( + let test = match fn_type { + 
Fntype::Normal => gen_test( &name, in_t, &out_t, @@ -1283,10 +1307,13 @@ fn gen_aarch64( [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], type_len(out_t), para_num, - ) + ), + Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)), + Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])), }; (function, test) } + fn gen_load_test( name: &str, _in_t: &[&str; 3], @@ -1298,7 +1325,7 @@ fn gen_load_test( Option, Vec, )], - len_out: usize, + type_len: usize, ) -> String { let mut test = format!( r#" @@ -1307,10 +1334,10 @@ fn gen_load_test( name, ); for (a, _, _, _, e) in current_tests { - let a: Vec = a.iter().take(len_out + 1).cloned().collect(); - let e: Vec = e.iter().take(len_out).cloned().collect(); + let a: Vec = a.iter().take(type_len + 1).cloned().collect(); + let e: Vec = e.iter().take(type_len).cloned().collect(); let mut input = String::from("["); - for i in 0..type_len(out_t) + 1 { + for i in 0..type_len + 1 { if i != 0 { input.push_str(", "); } @@ -1322,7 +1349,7 @@ fn gen_load_test( if i != 0 { output.push_str(", "); } - let sub_len = type_len(out_t) / type_sub_len(out_t); + let sub_len = type_len / type_sub_len(out_t); if type_to_global_type(out_t) != "f64" { let mut sub_output = format!("{}::new(", type_to_global_type(out_t)); for j in 0..sub_len { @@ -1346,7 +1373,7 @@ fn gen_load_test( assert_eq!(r, e); "#, type_to_native_type(out_t), - type_len(out_t) + 1, + type_len + 1, input, type_to_global_type(out_t), type_sub_len(out_t), @@ -1361,6 +1388,71 @@ fn gen_load_test( test } +fn gen_store_test( + name: &str, + in_t: &[&str; 3], + _out_t: &str, + current_tests: &[( + Vec, + Vec, + Vec, + Option, + Vec, + )], + type_len: usize, +) -> String { + let mut test = format!( + r#" + #[simd_test(enable = "neon")] + unsafe fn test_{}() {{"#, + name, + ); + for (a, _, _, _, e) in current_tests { + let a: Vec = a.iter().take(type_len + 1).cloned().collect(); + let e: Vec = e.iter().take(type_len).cloned().collect(); + let mut input = String::from("["); + for i in 0..type_len + 1 { + if i != 0 { + input.push_str(", "); + } + input.push_str(&a[i]) + } + input.push_str("]"); + let mut output = String::from("["); + for i in 0..type_len { + if i != 0 { + output.push_str(", "); + } + output.push_str(&e[i]) + } + output.push_str("]"); + let t = format!( + r#" + let a: [{}; {}] = {}; + let e: [{}; {}] = {}; + let mut r: [{}; {}] = [0{}; {}]; + {}(r.as_mut_ptr(), {}(a[1..].as_ptr())); + assert_eq!(r, e); +"#, + type_to_native_type(in_t[1]), + type_len + 1, + input, + type_to_native_type(in_t[1]), + type_len, + output, + type_to_native_type(in_t[1]), + type_len, + type_to_native_type(in_t[1]), + type_len, + name, + name.replace("st", "ld"), + ); + test.push_str(&t); + } + test.push_str(" }\n"); + test +} + fn gen_test( name: &str, in_t: &[&str; 3], @@ -1492,7 +1584,8 @@ fn gen_arm( target: TargetFeature, fixed: &Vec, multi_fn: &Vec, - test_fn: &str, + fn_type: Fntype, + separate: bool, ) -> (String, String) { let name = match suffix { Normal => format!("{}{}", current_name, type_to_suffix(in_t[1])), @@ -1609,19 +1702,37 @@ fn gen_arm( } let ext = type_to_ext(in_t[0]); let ext2 = type_to_ext(out_t); + let ext3 = type_to_ext(in_t[1]); + let ext3_arm = if matches!(fn_type, Fntype::Store) { + let s: Vec<_> = ext3.split('.').collect(); + assert_eq!(s.len(), 2); + format!("{}.{}", s[1], s[0]) + } else { + ext3.clone() + }; let link_arm = if link_arm.starts_with("llvm") { - link_arm.replace("_EXT_", ext).replace("_EXT2_", ext2) + 
link_arm + .replace("_EXT_", &ext) + .replace("_EXT2_", &ext2) + .replace("_EXT3_", &ext3_arm) } else { let mut link = String::from("llvm.arm.neon."); link.push_str(&link_arm); - link.replace("_EXT_", ext).replace("_EXT2_", ext2) + link.replace("_EXT_", &ext) + .replace("_EXT2_", &ext2) + .replace("_EXT3_", &ext3_arm) }; let link_aarch64 = if link_aarch64.starts_with("llvm") { - link_aarch64.replace("_EXT_", ext).replace("_EXT2_", ext2) + link_aarch64 + .replace("_EXT_", &ext) + .replace("_EXT2_", &ext2) + .replace("_EXT3_", &ext3) } else { let mut link = String::from("llvm.aarch64.neon."); link.push_str(&link_aarch64); - link.replace("_EXT_", ext).replace("_EXT2_", ext2) + link.replace("_EXT_", &ext) + .replace("_EXT2_", &ext2) + .replace("_EXT3_", &ext3) }; if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] { ext_c = format!( @@ -1715,6 +1826,30 @@ fn gen_arm( link_arm_t[3] )); } + if matches!(fn_type, Fntype::Store) { + let sub_type = type_to_sub_type(in_t[1]); + ext_c_arm.push_str(&format!( + r#"#[allow(improper_ctypes)] + extern "unadjusted" {{ + #[cfg_attr(target_arch = "arm", link_name = "{}")] + fn {}(ptr: *mut {}, {}); + }} +"#, + link_arm, + current_fn, + type_to_native_type(in_t[0]), + match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type), + 2 => format!("a: {}, b: {}", sub_type, sub_type,), + 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}", + sub_type, sub_type, sub_type, sub_type, + ), + _ => panic!("unknown type: {}", in_t[1]), + }, + )); + } if const_aarch64.is_some() { ext_c_aarch64.push_str(&format!( r#"#[allow(improper_ctypes)] @@ -1768,6 +1903,30 @@ fn gen_arm( link_aarch64_t[3] )); } + if matches!(fn_type, Fntype::Store) { + let sub_type = type_to_sub_type(in_t[1]); + ext_c_aarch64.push_str(&format!( + r#"#[allow(improper_ctypes)] + extern "unadjusted" {{ + #[cfg_attr(target_arch = "aarch64", link_name = "{}")] + fn {}({}, ptr: *mut {}); + }} +"#, + link_aarch64, + current_fn, + match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type), + 2 => format!("a: {}, b: {}", sub_type, sub_type,), + 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}", + sub_type, sub_type, sub_type, sub_type, + ), + _ => panic!("unknown type: {}", in_t[1]), + }, + type_to_native_type(in_t[0]), + )); + } }; let const_declare = if let Some(constn) = constn { format!(r#""#, constn) @@ -1808,72 +1967,82 @@ fn gen_arm( } else { String::new() }; - let trans: [&str; 2] = if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] { - ["", ""] - } else { - ["transmute(", ")"] + let fn_decl = { + let fn_output = if out_t == "void" { + String::new() + } else { + format!("-> {} ", out_t) + }; + let fn_inputs = match para_num { + 1 => format!("(a: {})", in_t[0]), + 2 => format!("(a: {}, b: {})", in_t[0], in_t[1]), + 3 => format!("(a: {}, b: {}, c: {})", in_t[0], in_t[1], in_t[2]), + _ => panic!("unsupported parameter number"), + }; + format!( + "pub unsafe fn {}{}{} {}", + name, const_declare, fn_inputs, fn_output + ) }; let call = match (multi_calls.len(), para_num, fixed.len()) { (0, 1, 0) => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ + r#"{}{{ {}{}(a) }}"#, - name, const_declare, in_t[0], out_t, ext_c, current_fn, + fn_decl, ext_c, current_fn, ), (0, 1, _) => { let fixed: Vec = fixed.iter().take(type_len(in_t[0])).cloned().collect(); format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ + r#"{}{{ let b{}; {}{}(a, transmute(b)) }}"#, - name, - const_declare, - 
in_t[0], - out_t, + fn_decl, values(in_t[0], &fixed), ext_c, current_fn, ) } (0, 2, _) => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{ + r#"{}{{ {}{}(a, b) }}"#, - name, const_declare, in_t[0], in_t[1], out_t, ext_c, current_fn, + fn_decl, ext_c, current_fn, ), (0, 3, _) => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}, c: {}) -> {} {{ + r#"{}{{ {}{}(a, b, c) }}"#, - name, const_declare, in_t[0], in_t[1], in_t[2], out_t, ext_c, current_fn, + fn_decl, ext_c, current_fn, ), (_, 1, _) => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ + r#"{}{{ {}{} }}"#, - name, const_declare, in_t[0], out_t, ext_c, multi_calls, + fn_decl, ext_c, multi_calls, ), (_, 2, _) => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{ + r#"{}{{ {}{} }}"#, - name, const_declare, in_t[0], in_t[1], out_t, ext_c, multi_calls, + fn_decl, ext_c, multi_calls, ), (_, 3, _) => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}, c: {}) -> {} {{ + r#"{}{{ {}{} }}"#, - name, const_declare, in_t[0], in_t[1], in_t[2], out_t, ext_c, multi_calls, + fn_decl, ext_c, multi_calls, ), (_, _, _) => String::new(), }; + let call_arm = if let Some(const_arm) = const_arm { let cnt = if const_arm.contains(':') { let consts: Vec<_> = const_arm.split(':').map(|v| v.trim().to_string()).collect(); consts[0].clone() } else { - let const_arm = const_arm.replace("ttn", type_to_native_type(in_t[1])); + let const_arm = const_arm.replace("ttn", &type_to_native_type(in_t[1])); let mut cnt = String::from(in_t[1]); cnt.push_str("("); for i in 0..type_len(in_t[1]) { @@ -1887,57 +2056,60 @@ fn gen_arm( }; match para_num { 1 => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ + r#"{}{{ {}{}{}(a, {}) }}"#, - name, const_declare, in_t[0], out_t, multi_calls, ext_c_arm, current_fn, cnt + fn_decl, multi_calls, ext_c_arm, current_fn, cnt ), 2 => format!( - r#"pub unsafe fn {}{}(a: {}, b:{}) -> {} {{ + r#"{}{{ {}{}{}(a, b, {}) }}"#, - name, - const_declare, - in_t[0], - in_t[1], - out_t, - multi_calls, - ext_c_arm, - current_fn, - cnt + fn_decl, multi_calls, ext_c_arm, current_fn, cnt ), _ => String::new(), } } else if out_t != link_arm_t[3] { match para_num { 1 => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ - {}{}{}{}(a){} + r#"{}{{ + {}{}transmute({}(a)) }}"#, - name, - const_declare, - in_t[0], - out_t, - multi_calls, - ext_c_arm, - trans[0], - current_fn, - trans[1] + fn_decl, multi_calls, ext_c_arm, current_fn, ), 2 => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{ - {}{}{}{}(transmute(a), transmute(b)){} + r#"{}{{ + {}{}transmute({}(transmute(a), transmute(b))) }}"#, - name, - const_declare, - in_t[0], - in_t[1], - out_t, - multi_calls, - ext_c_arm, - trans[0], - current_fn, - trans[1], + fn_decl, multi_calls, ext_c_arm, current_fn, + ), + _ => String::new(), + } + } else if matches!(fn_type, Fntype::Store) { + match type_sub_len(in_t[1]) { + 1 => format!( + r#"{}{{ + {}{}{}(a, b) +}}"#, + fn_decl, multi_calls, ext_c_arm, current_fn, + ), + 2 => format!( + r#"{}{{ + {}{}{}(a, b.0, b.1) +}}"#, + fn_decl, multi_calls, ext_c_arm, current_fn, + ), + 3 => format!( + r#"{}{{ + {}{}{}(a, b.0, b.1, b.2) +}}"#, + fn_decl, multi_calls, ext_c_arm, current_fn, + ), + 4 => format!( + r#"{}{{ + {}{}{}(a, b.0, b.1, b.2, b.3) +}}"#, + fn_decl, multi_calls, ext_c_arm, current_fn, ), _ => String::new(), } @@ -1947,74 +2119,67 @@ fn gen_arm( let call_aarch64 = if let Some(const_aarch64) = const_aarch64 { match para_num { 1 => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ + r#"{}{{ {}{}{}(a, {}) }}"#, - name, - const_declare, - in_t[0], - out_t, - 
multi_calls, - ext_c_aarch64, - current_fn, - const_aarch64 + fn_decl, multi_calls, ext_c_aarch64, current_fn, const_aarch64 ), 2 => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{ + r#"{}{{ {}{}{}(a, b, {}) }}"#, - name, - const_declare, - in_t[0], - in_t[1], - out_t, - multi_calls, - ext_c_aarch64, - current_fn, - const_aarch64 + fn_decl, multi_calls, ext_c_aarch64, current_fn, const_aarch64 ), _ => String::new(), } } else if out_t != link_aarch64_t[3] { match para_num { 1 => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ - {}{}{}{}(a){} + r#"{}{{ + {}{}transmute({}(a)) }}"#, - name, - const_declare, - in_t[0], - out_t, - multi_calls, - ext_c_aarch64, - trans[0], - current_fn, - trans[1], + fn_decl, multi_calls, ext_c_aarch64, current_fn, ), 2 => format!( - r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{ - {}{}{}{}(a, b){} + r#"{}{{ + {}{}transmute({}(a, b)) }}"#, - name, - const_declare, - in_t[0], - in_t[1], - out_t, - multi_calls, - ext_c_aarch64, - trans[0], - current_fn, - trans[1], + fn_decl, multi_calls, ext_c_aarch64, current_fn, + ), + _ => String::new(), + } + } else if matches!(fn_type, Fntype::Store) { + match type_sub_len(in_t[1]) { + 1 => format!( + r#"{}{{ + {}{}{}(b, a) +}}"#, + fn_decl, multi_calls, ext_c_aarch64, current_fn, + ), + 2 => format!( + r#"{}{{ + {}{}{}(b.0, b.1, a) +}}"#, + fn_decl, multi_calls, ext_c_aarch64, current_fn, + ), + 3 => format!( + r#"{}{{ + {}{}{}(b.0, b.1, b.2, a) +}}"#, + fn_decl, multi_calls, ext_c_aarch64, current_fn, + ), + 4 => format!( + r#"{}{{ + {}{}{}(b.0, b.1, b.2, b.3, a) +}}"#, + fn_decl, multi_calls, ext_c_aarch64, current_fn, ), _ => String::new(), } } else { String::new() }; - let function = if (const_arm.is_some() && const_aarch64.is_some()) - || out_t != link_arm_t[3] - || out_t != link_aarch64_t[3] - { + let function = if separate { format!( r#" {} @@ -2066,10 +2231,8 @@ fn gen_arm( call, ) }; - let test = if test_fn == "load_test" { - gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)) - } else { - gen_test( + let test = match fn_type { + Fntype::Normal => gen_test( &name, in_t, &out_t, @@ -2077,9 +2240,10 @@ fn gen_arm( [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], type_len(out_t), para_num, - ) + ), + Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)), + Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])), }; - (function, test) } @@ -2543,14 +2707,14 @@ fn get_call( let type1 = if types[0] == "element_t" { type_to_native_type(in_t[1]) } else { - &types[0] + String::from(&types[0]) }; let type2 = if types[1] == "element_t" { type_to_native_type(in_t[1]) } else { - &types[1] + String::from(&types[1]) }; - fn_name.push_str(&format!("::<{}, {}>", type1, type2)); + fn_name.push_str(&format!("::<{}, {}>", &type1, &type2)); } else { fn_name.push_str(&fn_format[2]); } @@ -2602,7 +2766,8 @@ fn main() -> io::Result<()> { )> = Vec::new(); let mut multi_fn: Vec = Vec::new(); let mut target: TargetFeature = Default; - let mut test_fn = "normal"; + let mut fn_type: Fntype = Fntype::Normal; + let mut separate = false; // // THIS FILE IS GENERATED FORM neon.spec DO NOT CHANGE IT MANUALLY @@ -2684,7 +2849,8 @@ mod test { n = None; multi_fn = Vec::new(); target = Default; - test_fn = "normal"; + fn_type = Fntype::Normal; + separate = false; } else if line.starts_with("//") { } else if line.starts_with("name = ") { current_name = Some(String::from(&line[7..])); @@ -2741,14 +2907,12 @@ mod test { link_arm = Some(String::from(&line[11..])); } 
else if line.starts_with("const-arm = ") { const_arm = Some(String::from(&line[12..])); - } else if line.starts_with("test = ") { - test_fn = if line.contains("load_test") { - "load_test" - } else if line.contains("store_test") { - "store_test" - } else { - "normal" - } + } else if line.starts_with("load_fn") { + fn_type = Fntype::Load; + } else if line.starts_with("store_fn") { + fn_type = Fntype::Store; + } else if line.starts_with("arm-aarch64-separate") { + separate = true; } else if line.starts_with("target = ") { target = match Some(String::from(&line[9..])) { Some(input) => match input.as_str() { @@ -2795,7 +2959,11 @@ mod test { panic!("Bad spec: {}", line) } if b.len() == 0 { - para_num = 1; + if matches!(fn_type, Fntype::Store) { + para_num = 2; + } else { + para_num = 1; + } } else if c.len() != 0 { para_num = 3; } @@ -2820,7 +2988,8 @@ mod test { target, &fixed, &multi_fn, - test_fn, + fn_type, + separate, ); out_arm.push_str(&function); tests_arm.push_str(&test); @@ -2841,7 +3010,7 @@ mod test { target, &fixed, &multi_fn, - test_fn, + fn_type, ); out_aarch64.push_str(&function); tests_aarch64.push_str(&test); From 0384304d3581586eab945b10fdd0c1cc47610d03 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 31 Aug 2021 11:07:35 +0800 Subject: [PATCH 2/2] modify the instruction limit --- crates/stdarch-test/src/lib.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index 10834c00e7..e5ccec216f 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -124,6 +124,9 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { // vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit) "usad8" | "vfma" | "vfms" => 27, "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29, + // core_arch/src/arm_shared/simd32 + // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) + "vst1" => 41, // Temporary, currently the fptosi.sat and fptoui.sat LLVM // intrinsics emit unnecessary code on arm. This can be