Skip to content

Commit 3f2d3b2

Browse files
GabrielMajeri authored and gnzlbg committed
Add fma estimate which uses fmuladd
1 parent 6504cf2 commit 3f2d3b2

File tree

6 files changed

+119
-6
lines changed

6 files changed

+119
-6
lines changed

src/api.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ macro_rules! impl_f {
184184
impl_math_float_abs!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
185185
impl_math_float_cos!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
186186
impl_math_float_fma!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
187+
impl_math_float_fmae!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
187188
impl_math_float_recpre!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
188189
impl_math_float_rsqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
189190
impl_math_float_sin!([$elem_ty; $elem_n]: $tuple_id | $test_tt);

src/api/math/float.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ mod rsqrte;
1818
#[macro_use]
1919
mod fma;
2020

21+
#[macro_use]
22+
mod fmae;
23+
2124
#[macro_use]
2225
mod sin;
2326

src/api/math/float/fma.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,6 @@ macro_rules! impl_math_float_fma {
44
([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {
55
impl $id {
66
/// Fused multiply add: `self * y + z`
7-
///
8-
/// Most architectures which have support for FMA
9-
/// also have an equivalent version of this function,
10-
/// fused multiply subtract (`self * y - z`).
11-
/// Simply negating the second parameter of this function
12-
/// will make the compiler generate it.
137
#[inline]
148
pub fn fma(self, y: Self, z: Self) -> Self {
159
use crate::codegen::math::float::fma::Fma;

src/api/math/float/fmae.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
//! Implements vertical (lane-wise) floating-point `fmae`.
2+
3+
macro_rules! impl_math_float_fmae {
4+
([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {
5+
impl $id {
6+
/// Fused multiply-add estimate: ~= `self * y + z`
7+
///
8+
/// Whereas fused multiply-add (`fma`) rounds only once, as if the product had infinite precision,
9+
/// `fmae` is at worst as precise as a multiply followed by an add.
10+
/// This might be more efficient on architectures that do not have an `fma` instruction.
11+
#[inline]
12+
pub fn fmae(self, y: Self, z: Self) -> Self {
13+
use crate::codegen::math::float::fmae::Fmae;
14+
Fmae::fmae(self, y, z)
15+
}
16+
}
17+
18+
test_if!{
19+
$test_tt:
20+
interpolate_idents! {
21+
pub mod [$id _math_fmae] {
22+
use super::*;
23+
#[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
24+
fn fmae() {
25+
let z = $id::splat(0 as $elem_ty);
26+
let o = $id::splat(1 as $elem_ty);
27+
let t = $id::splat(2 as $elem_ty);
28+
let t3 = $id::splat(3 as $elem_ty);
29+
let f = $id::splat(4 as $elem_ty);
30+
31+
assert_eq!(z, z.fmae(z, z));
32+
assert_eq!(o, o.fmae(o, z));
33+
assert_eq!(o, o.fmae(z, o));
34+
assert_eq!(o, z.fmae(o, o));
35+
36+
assert_eq!(t, o.fmae(o, o));
37+
assert_eq!(t, o.fmae(t, z));
38+
assert_eq!(t, t.fmae(o, z));
39+
40+
assert_eq!(f, t.fmae(t, z));
41+
assert_eq!(f, t.fmae(o, t));
42+
assert_eq!(t3, t.fmae(o, o));
43+
}
44+
}
45+
}
46+
}
47+
};
48+
}

src/codegen/math/float.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ crate mod abs;
55
crate mod cos;
66
crate mod cos_pi;
77
crate mod fma;
8+
crate mod fmae;
89
crate mod sin;
910
crate mod sin_cos_pi;
1011
crate mod sin_pi;

src/codegen/math/float/fmae.rs

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//! Approximation for floating-point `fma`
2+
3+
#![allow(unused)]
4+
5+
use crate::*;
6+
7+
/// Crate-internal trait providing the fused multiply-add estimate for the
/// floating-point vector types that implement it in this module.
crate trait Fmae {
8+
/// Returns an approximation of `self * y + z`.
fn fmae(self, y: Self, z: Self) -> Self;
9+
}
10+
11+
// Declarations of the `llvm.fmuladd.*` intrinsics, one per supported
// vector type. They are not declared on s390x, where `fmae` computes
// `self * y + z` directly instead of calling an intrinsic.
#[cfg(not(target_arch = "s390x"))]
12+
#[allow(improper_ctypes)]
13+
extern "C" {
14+
#[link_name = "llvm.fmuladd.v2f32"]
15+
fn fmuladd_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2;
16+
#[link_name = "llvm.fmuladd.v4f32"]
17+
fn fmuladd_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4;
18+
#[link_name = "llvm.fmuladd.v8f32"]
19+
fn fmuladd_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8;
20+
#[link_name = "llvm.fmuladd.v16f32"]
21+
fn fmuladd_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16;
22+
/* FIXME 64-bit single elem vectors
23+
#[link_name = "llvm.fmuladd.v1f64"]
24+
fn fmuladd_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1;
25+
*/
26+
#[link_name = "llvm.fmuladd.v2f64"]
27+
fn fmuladd_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2;
28+
#[link_name = "llvm.fmuladd.v4f64"]
29+
fn fmuladd_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4;
30+
#[link_name = "llvm.fmuladd.v8f64"]
31+
fn fmuladd_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8;
32+
}
33+
34+
macro_rules! impl_fma {
35+
($id:ident : $fn:ident) => {
36+
impl Fmae for $id {
37+
#[inline]
38+
fn fmae(self, y: Self, z: Self) -> Self {
39+
#[cfg(not(target_arch = "s390x"))]
40+
{
41+
unsafe {
42+
mem::transmute($fn(
43+
mem::transmute(self),
44+
mem::transmute(y),
45+
mem::transmute(z),
46+
))
47+
}
48+
}
49+
#[cfg(target_arch = "s390x")]
50+
{
51+
// FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
52+
self * y + z
53+
}
54+
}
55+
}
56+
};
57+
}
58+
59+
impl_fma!(f32x2: fmuladd_v2f32);
60+
impl_fma!(f32x4: fmuladd_v4f32);
61+
impl_fma!(f32x8: fmuladd_v8f32);
62+
impl_fma!(f32x16: fmuladd_v16f32);
63+
// impl_fma!(f64x1: fmuladd_v1f64); // FIXME 64-bit single elem vectors
64+
impl_fma!(f64x2: fmuladd_v2f64);
65+
impl_fma!(f64x4: fmuladd_v4f64);
66+
impl_fma!(f64x8: fmuladd_v8f64);

0 commit comments

Comments
 (0)