Add fma estimate which uses fmuladd

GabrielMajeri · gnzlbg · commit 3f2d3b2dc597 · 2018-09-03T15:56:41.000+02:00
diff --git a/src/api.rs b/src/api.rs
@@ -184,6 +184,7 @@ macro_rules! impl_f {
         impl_math_float_abs!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
         impl_math_float_cos!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
         impl_math_float_fma!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
+        impl_math_float_fmae!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
         impl_math_float_recpre!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
         impl_math_float_rsqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
         impl_math_float_sin!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
diff --git a/src/api/math/float.rs b/src/api/math/float.rs
@@ -18,6 +18,9 @@ mod rsqrte;
 #[macro_use]
 mod fma;
 
+#[macro_use]
+mod fmae;
+
 #[macro_use]
 mod sin;
 
diff --git a/src/api/math/float/fma.rs b/src/api/math/float/fma.rs
@@ -4,12 +4,6 @@ macro_rules! impl_math_float_fma {
     ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {
         impl $id {
             /// Fused multiply add: `self * y + z`
-            ///
-            /// Most architectures which have support for FMA
-            /// also have an equivalent version of this function,
-            /// fused multiply subtract (`self * y - z`).
-            /// Simply negating the second parameter of this function
-            /// will make the compiler generate it.
             #[inline]
             pub fn fma(self, y: Self, z: Self) -> Self {
                 use crate::codegen::math::float::fma::Fma;
diff --git a/src/api/math/float/fmae.rs b/src/api/math/float/fmae.rs
@@ -0,0 +1,48 @@
+//! Implements vertical (lane-wise) floating-point `fmae`.
+
+macro_rules! impl_math_float_fmae {
+    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {
+        impl $id {
+            /// Fused multiply add estimate: ~= `self * y + z`
+            ///
+            /// While fused multiply-add (`fma`) has infinite intermediate precision (it rounds only once),
+            /// `fmae` has at worst the same precision as a multiply followed by an add.
+            /// This might be more efficient on architectures that do not have an `fma` instruction.
+            #[inline]
+            pub fn fmae(self, y: Self, z: Self) -> Self {
+                use crate::codegen::math::float::fmae::Fmae;
+                Fmae::fmae(self, y, z)
+            }
+        }
+
+        test_if!{
+            $test_tt:
+            interpolate_idents! {
+                pub mod [$id _math_fmae] {
+                    use super::*;
+                    #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+                    fn fmae() {
+                        let z = $id::splat(0 as $elem_ty);
+                        let o = $id::splat(1 as $elem_ty);
+                        let t = $id::splat(2 as $elem_ty);
+                        let t3 = $id::splat(3 as $elem_ty);
+                        let f = $id::splat(4 as $elem_ty);
+
+                        assert_eq!(z, z.fmae(z, z));
+                        assert_eq!(o, o.fmae(o, z));
+                        assert_eq!(o, o.fmae(z, o));
+                        assert_eq!(o, z.fmae(o, o));
+
+                        assert_eq!(t, o.fmae(o, o));
+                        assert_eq!(t, o.fmae(t, z));
+                        assert_eq!(t, t.fmae(o, z));
+
+                        assert_eq!(f, t.fmae(t, z));
+                        assert_eq!(f, t.fmae(o, t));
+                        assert_eq!(t3, t.fmae(o, o));
+                    }
+                }
+            }
+        }
+    };
+}
diff --git a/src/codegen/math/float.rs b/src/codegen/math/float.rs
@@ -5,6 +5,7 @@ crate mod abs;
 crate mod cos;
 crate mod cos_pi;
 crate mod fma;
+crate mod fmae;
 crate mod sin;
 crate mod sin_cos_pi;
 crate mod sin_pi;
diff --git a/src/codegen/math/float/fmae.rs b/src/codegen/math/float/fmae.rs
@@ -0,0 +1,66 @@
+//! Approximation for floating-point `fma`
+
+#![allow(unused)]
+
+use crate::*;
+
+crate trait Fmae {
+    fn fmae(self, y: Self, z: Self) -> Self;
+}
+
+#[cfg(not(target_arch = "s390x"))]
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.fmuladd.v2f32"]
+    fn fmuladd_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2;
+    #[link_name = "llvm.fmuladd.v4f32"]
+    fn fmuladd_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4;
+    #[link_name = "llvm.fmuladd.v8f32"]
+    fn fmuladd_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8;
+    #[link_name = "llvm.fmuladd.v16f32"]
+    fn fmuladd_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16;
+    /* FIXME 64-bit single elem vectors
+    #[link_name = "llvm.fmuladd.v1f64"]
+    fn fmuladd_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1;
+    */
+    #[link_name = "llvm.fmuladd.v2f64"]
+    fn fmuladd_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2;
+    #[link_name = "llvm.fmuladd.v4f64"]
+    fn fmuladd_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4;
+    #[link_name = "llvm.fmuladd.v8f64"]
+    fn fmuladd_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8;
+}
+
+macro_rules! impl_fma {
+    ($id:ident : $fn:ident) => {
+        impl Fmae for $id {
+            #[inline]
+            fn fmae(self, y: Self, z: Self) -> Self {
+                #[cfg(not(target_arch = "s390x"))]
+                {
+                    unsafe {
+                        mem::transmute($fn(
+                            mem::transmute(self),
+                            mem::transmute(y),
+                            mem::transmute(z),
+                        ))
+                    }
+                }
+                #[cfg(target_arch = "s390x")]
+                {
+                    // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+                    self * y + z
+                }
+            }
+        }
+    };
+}
+
+impl_fma!(f32x2: fmuladd_v2f32);
+impl_fma!(f32x4: fmuladd_v4f32);
+impl_fma!(f32x8: fmuladd_v8f32);
+impl_fma!(f32x16: fmuladd_v16f32);
+// impl_fma!(f64x1: fmuladd_v1f64); // FIXME 64-bit single elem vectors
+impl_fma!(f64x2: fmuladd_v2f64);
+impl_fma!(f64x4: fmuladd_v4f64);
+impl_fma!(f64x8: fmuladd_v8f64);