ARM DSP: add Quad/Double add/sub with exchange and select bytes intrinsics (#532)

paoloteti · alexcrichton · commit e6ff0972f09c · 2018-07-21T10:51:18.000-05:00
- Quad 8-bit addition/subtraction
- Double 16-bit addition/subtraction
- Saturating Add and Subtract with Exchange and
  Saturating Subtract and Add with Exchange, signed
- Select bytes based on GE bits

This patch bumps the `assert_instr` limit to 22 (from 20) instead of
adding a lot of exceptions for all DSP intrinsics.
diff --git a/coresimd/arm/dsp.rs b/coresimd/arm/dsp.rs
@@ -14,24 +14,48 @@ types! {
     pub struct uint16x2_t(u16, u16);
 }
 
+macro_rules! dsp_call {
+    ($name:expr, $a:expr, $b:expr) => {
+        ::mem::transmute($name(::mem::transmute($a), ::mem::transmute($b)))
+    };
+}
+
 extern "C" {
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd")]
+    #[link_name = "llvm.arm.qadd"]
     fn arm_qadd(a: i32, b: i32) -> i32;
 
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub")]
-    fn arm_qsub(a: i32, b: i32) -> i32;
+    #[link_name = "llvm.arm.qadd16"]
+    fn arm_qadd16(a: i32, b: i32) -> i32;
 
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd8")]
+    #[link_name = "llvm.arm.qadd8"]
     fn arm_qadd8(a: i32, b: i32) -> i32;
 
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub8")]
-    fn arm_qsub8(a: i32, b: i32) -> i32;
+    #[link_name = "llvm.arm.qasx"]
+    fn arm_qasx(a: i32, b: i32) -> i32;
 
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd16")]
-    fn arm_qadd16(a: i32, b: i32) -> i32;
+    #[link_name = "llvm.arm.qsax"]
+    fn arm_qsax(a: i32, b: i32) -> i32;
 
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub16")]
+    #[link_name = "llvm.arm.qsub"]
+    fn arm_qsub(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub8"]
+    fn arm_qsub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub16"]
     fn arm_qsub16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sadd16"]
+    fn arm_sadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sadd8"]
+    fn arm_sadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sasx"]
+    fn arm_sasx(a: i32, b: i32) -> i32;
+
+    #[cfg_attr(not(target_feature = "mclass"), link_name = "llvm.arm.sel")]
+    fn arm_sel(a: i32, b: i32) -> i32;
 }
 
 /// Signed saturating addition
@@ -63,7 +87,7 @@ pub unsafe fn qsub(a: i32, b: i32) -> i32 {
 #[inline]
 #[cfg_attr(test, assert_instr(qadd8))]
 pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    ::mem::transmute(arm_qadd8(::mem::transmute(a), ::mem::transmute(b)))
+    dsp_call!(arm_qadd8, a, b)
 }
 
 /// Saturating two 8-bit integer subtraction
@@ -77,7 +101,7 @@ pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 #[inline]
 #[cfg_attr(test, assert_instr(qsub8))]
 pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    ::mem::transmute(arm_qsub8(::mem::transmute(a), ::mem::transmute(b)))
+    dsp_call!(arm_qsub8, a, b)
 }
 
 /// Saturating two 16-bit integer subtraction
@@ -89,7 +113,7 @@ pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 #[inline]
 #[cfg_attr(test, assert_instr(qsub16))]
 pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    ::mem::transmute(arm_qsub16(::mem::transmute(a), ::mem::transmute(b)))
+    dsp_call!(arm_qsub16, a, b)
 }
 
 /// Saturating two 16-bit integer additions
@@ -101,7 +125,80 @@ pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 #[inline]
 #[cfg_attr(test, assert_instr(qadd16))]
 pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    ::mem::transmute(arm_qadd16(::mem::transmute(a), ::mem::transmute(b)))
+    dsp_call!(arm_qadd16, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res[0] = a[0] - b[1]
+/// res[1] = a[1] + b[0]
+#[inline]
+#[cfg_attr(test, assert_instr(qasx))]
+pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qasx, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res[0] = a[0] + b[1]
+/// res[1] = a[1] - b[0]
+#[inline]
+#[cfg_attr(test, assert_instr(qsax))]
+pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qsax, a, b)
+}
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res[0] = a[0] + b[0]
+/// res[1] = a[1] + b[1]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd16))]
+pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_sadd16, a, b)
+}
+
+/// Returns the 8-bit signed equivalent of
+///
+/// res[0] = a[0] + b[0]
+/// res[1] = a[1] + b[1]
+/// res[2] = a[2] + b[2]
+/// res[3] = a[3] + b[3]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd8))]
+pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_sadd8, a, b)
+}
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res[0] = a[0] - b[1]
+/// res[1] = a[1] + b[0]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sasx))]
+pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_sasx, a, b)
+}
+
+/// Returns the equivalent of
+///
+/// res[0] = GE[0] ? a[0] : b[0]
+/// res[1] = GE[1] ? a[1] : b[1]
+/// res[2] = GE[2] ? a[2] : b[2]
+/// res[3] = GE[3] ? a[3] : b[3]
+///
+/// where GE are bits of APSR
+#[inline]
+#[cfg_attr(test, assert_instr(sel))]
+#[cfg(all(not(target_feature = "mclass")))]
+pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_sel, a, b)
 }
 
 #[cfg(test)]
@@ -135,7 +232,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
             let b = i8x4::new(2, -1, 0, 1);
             let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
-            let r: i8x4 = ::mem::transmute(dsp::qadd8(::mem::transmute(a), ::mem::transmute(b)));
+            let r: i8x4 = dsp_call!(dsp::qadd8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -146,7 +243,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
             let b = i8x4::new(2, -1, 0, 1);
             let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
-            let r: i8x4 = ::mem::transmute(dsp::qsub8(::mem::transmute(a),::mem::transmute(b)));
+            let r: i8x4 = dsp_call!(dsp::qsub8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -157,7 +254,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(2, -1);
             let c = i16x2::new(3, 1);
-            let r: i16x2 = ::mem::transmute(dsp::qadd16(::mem::transmute(a),::mem::transmute(b)));
+            let r: i16x2 = dsp_call!(dsp::qadd16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -168,7 +265,75 @@ mod tests {
             let a = i16x2::new(10, 20);
             let b = i16x2::new(20, -10);
             let c = i16x2::new(-10, 30);
-            let r: i16x2 = ::mem::transmute(dsp::qsub16(::mem::transmute(a), ::mem::transmute(b)));
+            let r: i16x2 = dsp_call!(dsp::qsub16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qasx() {
+        unsafe {
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(-1, ::std::i16::MAX);
+            let r: i16x2 = dsp_call!(dsp::qasx, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsax() {
+        unsafe {
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(3, ::std::i16::MAX - 2);
+            let r: i16x2 = dsp_call!(dsp::qsax, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sadd16() {
+        unsafe {
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(3, -::std::i16::MAX);
+            let r: i16x2 = dsp_call!(dsp::sadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(4, 3, 2, 2);
+            let c = i8x4::new(5, 5, 5, -::std::i8::MAX);
+            let r: i8x4 = dsp_call!(dsp::sadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sasx() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(2, 1);
+            let c = i16x2::new(0, 4);
+            let r: i16x2 = dsp_call!(dsp::sasx, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sel() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(4, 3, 2, 2);
+            // call sadd8() to set GE bits
+            dsp::sadd8(::mem::transmute(a), ::mem::transmute(b));
+            let c = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let r: i8x4 = dsp_call!(dsp::sel, a, b);
             assert_eq!(r, c);
         }
     }
diff --git a/crates/stdsimd-test/src/lib.rs b/crates/stdsimd-test/src/lib.rs
@@ -338,7 +338,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
 
     let instruction_limit = match expected {
         // cpuid returns a pretty big aggregate structure so excempt it from
-        // the slightly more restrictive 20 instructions below
+        // the slightly more restrictive 22 instructions below
         "cpuid" => 30,
 
         // Apparently on Windows LLVM generates a bunch of saves/restores of
@@ -351,11 +351,10 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
         // cases exceed the limit.
         "cvtpi2ps" => 25,
 
-        // In this case the overall length, counting also the 'mergefunc'
-        // workaround overhead, is exactly 20 instructions.
-        "qsub8" | "qadd8" | "qsub16" | "qadd16" => 22,
-
-        _ => 20,
+        // Original limit was 20 instructions, but ARM DSP Intrinsics are
+        // exactly 20 instructions long. So bump the limit to 22 instead of
+        // adding a long list of exceptions here.
+        _ => 22,
     };
     let probably_only_one_instruction = instrs.len() < instruction_limit;