19
19
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
20
21
21
use crate :: core_arch:: x86:: * ;
22
- use crate :: intrinsics:: simd:: { simd_fma, simd_insert, simd_neg} ;
22
+ use crate :: intrinsics:: simd:: { simd_fma, simd_insert, simd_neg, simd_shuffle } ;
23
23
use crate :: intrinsics:: { fmaf32, fmaf64} ;
24
24
25
25
#[ cfg( test) ]
@@ -119,7 +119,9 @@ pub unsafe fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
119
119
#[ cfg_attr( test, assert_instr( vfmaddsub) ) ]
120
120
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
121
121
// Diff view of `_mm_fmaddsub_pd`: the `-` line is the old body, the `+` lines are its replacement.
// New body: compute a*b + c and a*b - c for all lanes, then pick per lane with a shuffle.
// The `assert_instr( vfmaddsub)` attribute above this function is an unchanged context line.
pub unsafe fn _mm_fmaddsub_pd ( a : __m128d , b : __m128d , c : __m128d ) -> __m128d {
// Removed: call into the `vfmaddsubpd` binding (link_name "llvm.x86.fma.vfmaddsub.pd").
122
- vfmaddsubpd ( a, b, c)
// `add` holds the fused a*b + c result for every lane.
122
+ let add = simd_fma ( a, b, c) ;
// Negating `c` first turns the same fused operation into a*b - c.
123
+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
// Indices address the concatenation of `add` and `sub` (2 lanes each) — assumed from the
// index range, TODO confirm against the macro: 2 -> sub[0], 1 -> add[1].
// Result: lane 0 = a*b - c, lane 1 = a*b + c.
124
+ simd_shuffle ! ( add, sub, [ 2 , 1 ] )
123
125
}
124
126
125
127
/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -132,7 +134,9 @@ pub unsafe fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
132
134
#[ cfg_attr( test, assert_instr( vfmaddsub) ) ]
133
135
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
134
136
// Diff view of `_mm256_fmaddsub_pd`: the `-` line is the old body, the `+` lines are its replacement.
// New body: compute a*b + c and a*b - c for all lanes, then pick per lane with a shuffle.
pub unsafe fn _mm256_fmaddsub_pd ( a : __m256d , b : __m256d , c : __m256d ) -> __m256d {
// Removed: call into the `vfmaddsubpd256` binding (link_name "llvm.x86.fma.vfmaddsub.pd.256").
135
- vfmaddsubpd256 ( a, b, c)
// `add` holds the fused a*b + c result for every lane.
137
+ let add = simd_fma ( a, b, c) ;
// Negating `c` first turns the same fused operation into a*b - c.
138
+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
// Indices address the concatenation of `add` and `sub` (4 lanes each) — assumed from the
// index range, TODO confirm against the macro: 4 -> sub[0], 1 -> add[1], 6 -> sub[2], 3 -> add[3].
// Result: even lanes = a*b - c, odd lanes = a*b + c.
139
+ simd_shuffle ! ( add, sub, [ 4 , 1 , 6 , 3 ] )
136
140
}
137
141
138
142
/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -145,7 +149,9 @@ pub unsafe fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d
145
149
#[ cfg_attr( test, assert_instr( vfmaddsub) ) ]
146
150
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
147
151
// Diff view of `_mm_fmaddsub_ps`: the `-` line is the old body, the `+` lines are its replacement.
// New body: compute a*b + c and a*b - c for all lanes, then pick per lane with a shuffle.
pub unsafe fn _mm_fmaddsub_ps ( a : __m128 , b : __m128 , c : __m128 ) -> __m128 {
// Removed: call into the `vfmaddsubps` binding (link_name "llvm.x86.fma.vfmaddsub.ps").
148
- vfmaddsubps ( a, b, c)
// `add` holds the fused a*b + c result for every lane.
152
+ let add = simd_fma ( a, b, c) ;
// Negating `c` first turns the same fused operation into a*b - c.
153
+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
// Indices address the concatenation of `add` and `sub` (4 lanes each) — assumed from the
// index range, TODO confirm against the macro: 4 -> sub[0], 1 -> add[1], 6 -> sub[2], 3 -> add[3].
// Result: even lanes = a*b - c, odd lanes = a*b + c.
154
+ simd_shuffle ! ( add, sub, [ 4 , 1 , 6 , 3 ] )
149
155
}
150
156
151
157
/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -158,7 +164,9 @@ pub unsafe fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
158
164
#[ cfg_attr( test, assert_instr( vfmaddsub) ) ]
159
165
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
160
166
// Diff view of `_mm256_fmaddsub_ps`: the `-` line is the old body, the `+` lines are its replacement.
// New body: compute a*b + c and a*b - c for all lanes, then pick per lane with a shuffle.
pub unsafe fn _mm256_fmaddsub_ps ( a : __m256 , b : __m256 , c : __m256 ) -> __m256 {
// Removed: call into the `vfmaddsubps256` binding (link_name "llvm.x86.fma.vfmaddsub.ps.256").
161
- vfmaddsubps256 ( a, b, c)
// `add` holds the fused a*b + c result for every lane.
167
+ let add = simd_fma ( a, b, c) ;
// Negating `c` first turns the same fused operation into a*b - c.
168
+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
// Indices address the concatenation of `add` and `sub` (8 lanes each) — assumed from the
// index range, TODO confirm against the macro: 8/10/12/14 -> sub[0]/sub[2]/sub[4]/sub[6],
// while 1/3/5/7 -> add[1]/add[3]/add[5]/add[7].
// Result: even lanes = a*b - c, odd lanes = a*b + c.
169
+ simd_shuffle ! ( add, sub, [ 8 , 1 , 10 , 3 , 12 , 5 , 14 , 7 ] )
162
170
}
163
171
164
172
/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -255,7 +263,9 @@ pub unsafe fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
255
263
#[ cfg_attr( test, assert_instr( vfmsubadd) ) ]
256
264
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
257
265
// Diff view of `_mm_fmsubadd_pd`: the `-` line is the old body, the `+` lines are its replacement.
// New body: compute a*b + c and a*b - c for all lanes, then pick per lane with a shuffle.
// The `assert_instr( vfmsubadd)` attribute above this function is an unchanged context line.
pub unsafe fn _mm_fmsubadd_pd ( a : __m128d , b : __m128d , c : __m128d ) -> __m128d {
// Removed: call into the `vfmsubaddpd` binding (link_name "llvm.x86.fma.vfmsubadd.pd").
258
- vfmsubaddpd ( a, b, c)
// `add` holds the fused a*b + c result for every lane.
266
+ let add = simd_fma ( a, b, c) ;
// Negating `c` first turns the same fused operation into a*b - c.
267
+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
// Indices address the concatenation of `add` and `sub` (2 lanes each) — assumed from the
// index range, TODO confirm against the macro: 0 -> add[0], 3 -> sub[1].
// Result: lane 0 = a*b + c, lane 1 = a*b - c (the mirror image of the fmaddsub variants).
268
+ simd_shuffle ! ( add, sub, [ 0 , 3 ] )
259
269
}
260
270
261
271
/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -268,7 +278,9 @@ pub unsafe fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
268
278
#[ cfg_attr( test, assert_instr( vfmsubadd) ) ]
269
279
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
270
280
// Diff view of `_mm256_fmsubadd_pd`: the `-` line is the old body, the `+` lines are its replacement.
// New body: compute a*b + c and a*b - c for all lanes, then pick per lane with a shuffle.
pub unsafe fn _mm256_fmsubadd_pd ( a : __m256d , b : __m256d , c : __m256d ) -> __m256d {
// Removed: call into the `vfmsubaddpd256` binding (link_name "llvm.x86.fma.vfmsubadd.pd.256").
271
- vfmsubaddpd256 ( a, b, c)
// `add` holds the fused a*b + c result for every lane.
281
+ let add = simd_fma ( a, b, c) ;
// Negating `c` first turns the same fused operation into a*b - c.
282
+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
// Indices address the concatenation of `add` and `sub` (4 lanes each) — assumed from the
// index range, TODO confirm against the macro: 0 -> add[0], 5 -> sub[1], 2 -> add[2], 7 -> sub[3].
// Result: even lanes = a*b + c, odd lanes = a*b - c.
283
+ simd_shuffle ! ( add, sub, [ 0 , 5 , 2 , 7 ] )
272
284
}
273
285
274
286
/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -281,7 +293,9 @@ pub unsafe fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d
281
293
#[ cfg_attr( test, assert_instr( vfmsubadd) ) ]
282
294
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
283
295
// Diff view of `_mm_fmsubadd_ps`: the `-` line is the old body, the `+` lines are its replacement.
// New body: compute a*b + c and a*b - c for all lanes, then pick per lane with a shuffle.
pub unsafe fn _mm_fmsubadd_ps ( a : __m128 , b : __m128 , c : __m128 ) -> __m128 {
// Removed: call into the `vfmsubaddps` binding (link_name "llvm.x86.fma.vfmsubadd.ps").
284
- vfmsubaddps ( a, b, c)
// `add` holds the fused a*b + c result for every lane.
296
+ let add = simd_fma ( a, b, c) ;
// Negating `c` first turns the same fused operation into a*b - c.
297
+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
// Indices address the concatenation of `add` and `sub` (4 lanes each) — assumed from the
// index range, TODO confirm against the macro: 0 -> add[0], 5 -> sub[1], 2 -> add[2], 7 -> sub[3].
// Result: even lanes = a*b + c, odd lanes = a*b - c.
298
+ simd_shuffle ! ( add, sub, [ 0 , 5 , 2 , 7 ] )
285
299
}
286
300
287
301
/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -294,7 +308,9 @@ pub unsafe fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
294
308
#[ cfg_attr( test, assert_instr( vfmsubadd) ) ]
295
309
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
296
310
// Diff view of `_mm256_fmsubadd_ps`: the `-` line is the old body, the `+` lines are its replacement.
// New body: compute a*b + c and a*b - c for all lanes, then pick per lane with a shuffle.
pub unsafe fn _mm256_fmsubadd_ps ( a : __m256 , b : __m256 , c : __m256 ) -> __m256 {
// Removed: call into the `vfmsubaddps256` binding (link_name "llvm.x86.fma.vfmsubadd.ps.256").
297
- vfmsubaddps256 ( a, b, c)
// `add` holds the fused a*b + c result for every lane.
311
+ let add = simd_fma ( a, b, c) ;
// Negating `c` first turns the same fused operation into a*b - c.
312
+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
// Indices address the concatenation of `add` and `sub` (8 lanes each) — assumed from the
// index range, TODO confirm against the macro: 0/2/4/6 -> add[0]/add[2]/add[4]/add[6],
// while 9/11/13/15 -> sub[1]/sub[3]/sub[5]/sub[7].
// Result: even lanes = a*b + c, odd lanes = a*b - c.
313
+ simd_shuffle ! ( add, sub, [ 0 , 9 , 2 , 11 , 4 , 13 , 6 , 15 ] )
298
314
}
299
315
300
316
/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -471,26 +487,6 @@ pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
471
487
)
472
488
}
473
489
474
// Every line of this block carries a `-` marker: the diff deletes the whole `extern "C"` block.
// It declared eight bindings, each tied by `link_name` to an `llvm.x86.fma.*` symbol:
// vfmaddsub and vfmsubadd, each in pd/ps element flavours and 128-bit/256-bit (`.256`) widths.
// The eight `_mm*_fmaddsub_*` / `_mm*_fmsubadd_*` hunks earlier in this diff drop their calls to
// these bindings, which is presumably why the declarations are removed here.
- #[ allow( improper_ctypes) ]
475
- extern "C" {
476
// fmaddsub family.
- #[ link_name = "llvm.x86.fma.vfmaddsub.pd" ]
477
- fn vfmaddsubpd ( a : __m128d , b : __m128d , c : __m128d ) -> __m128d ;
478
- #[ link_name = "llvm.x86.fma.vfmaddsub.pd.256" ]
479
- fn vfmaddsubpd256 ( a : __m256d , b : __m256d , c : __m256d ) -> __m256d ;
480
- #[ link_name = "llvm.x86.fma.vfmaddsub.ps" ]
481
- fn vfmaddsubps ( a : __m128 , b : __m128 , c : __m128 ) -> __m128 ;
482
- #[ link_name = "llvm.x86.fma.vfmaddsub.ps.256" ]
483
- fn vfmaddsubps256 ( a : __m256 , b : __m256 , c : __m256 ) -> __m256 ;
484
// fmsubadd family.
- #[ link_name = "llvm.x86.fma.vfmsubadd.pd" ]
485
- fn vfmsubaddpd ( a : __m128d , b : __m128d , c : __m128d ) -> __m128d ;
486
- #[ link_name = "llvm.x86.fma.vfmsubadd.pd.256" ]
487
- fn vfmsubaddpd256 ( a : __m256d , b : __m256d , c : __m256d ) -> __m256d ;
488
- #[ link_name = "llvm.x86.fma.vfmsubadd.ps" ]
489
- fn vfmsubaddps ( a : __m128 , b : __m128 , c : __m128 ) -> __m128 ;
490
- #[ link_name = "llvm.x86.fma.vfmsubadd.ps.256" ]
491
- fn vfmsubaddps256 ( a : __m256 , b : __m256 , c : __m256 ) -> __m256 ;
492
- }
493
-
494
490
#[ cfg( test) ]
495
491
mod tests {
496
492
0 commit comments