Use fmuladd for fma and document this behavior

GabrielMajeri · GabrielMajeri · commit 3d7712bd3aba · 2018-09-01T13:36:40.000+03:00
diff --git a/src/api/math/float/fma.rs b/src/api/math/float/fma.rs
@@ -5,6 +5,19 @@ macro_rules! impl_math_float_fma {
         impl $id {
             /// Fused multiply add: `self * y + z`
             ///
+            /// On some architectures, it is possible to combine a multiply
+            /// followed by an addition into a single instruction.
+            /// Besides performance, this may also offer better precision
+            /// than performing the operations individually.
+            ///
+            /// Note that using this function does **not** guarantee that an FMA
+            /// instruction will be emitted;
+            /// the architecture may not support it, or the compiler may decide
+            /// it's more efficient in a specific case not to use it.
+            ///
+            /// Use your architecture's intrinsic if you absolutely require
+            /// the fused (single-rounding) result in all circumstances.
+            ///
             /// Most architectures which have support for FMA
             /// also have an equivalent version of this function,
             /// fused multiply subtract (`self * y - z`).
diff --git a/src/codegen/math/float/fma.rs b/src/codegen/math/float/fma.rs
@@ -11,23 +11,25 @@ crate trait Fma {
 #[cfg(not(target_arch = "s390x"))]
 #[allow(improper_ctypes)]
 extern "C" {
-    #[link_name = "llvm.fma.v2f32"]
+    // We use the `fmuladd` intrinsic instead of `fma` to allow LLVM to decide
+    // on a per-case basis whether it's better to use FMA or not.
+    #[link_name = "llvm.fmuladd.v2f32"]
     fn fma_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2;
-    #[link_name = "llvm.fma.v4f32"]
+    #[link_name = "llvm.fmuladd.v4f32"]
     fn fma_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4;
-    #[link_name = "llvm.fma.v8f32"]
+    #[link_name = "llvm.fmuladd.v8f32"]
     fn fma_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8;
-    #[link_name = "llvm.fma.v16f32"]
+    #[link_name = "llvm.fmuladd.v16f32"]
     fn fma_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16;
     /* FIXME 64-bit single elem vectors
-    #[link_name = "llvm.fma.v1f64"]
+    #[link_name = "llvm.fmuladd.v1f64"]
     fn fma_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1;
     */
-    #[link_name = "llvm.fma.v2f64"]
+    #[link_name = "llvm.fmuladd.v2f64"]
     fn fma_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2;
-    #[link_name = "llvm.fma.v4f64"]
+    #[link_name = "llvm.fmuladd.v4f64"]
     fn fma_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4;
-    #[link_name = "llvm.fma.v8f64"]
+    #[link_name = "llvm.fmuladd.v8f64"]
     fn fma_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8;
 }