diff --git a/ci/run.sh b/ci/run.sh index ffa36bdcec..1fa475f27f 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -5,6 +5,8 @@ set -ex # Tests are all super fast anyway, and they fault often enough on travis that # having only one thread increases debuggability to be worth it. export RUST_TEST_THREADS=1 +export RUST_BACKTRACE=1 +export RUST_TEST_NOCAPTURE=1 # FIXME(rust-lang-nursery/stdsimd#120) run-time feature detection for ARM Neon case ${TARGET} in diff --git a/coresimd/aarch64/neon.rs b/coresimd/aarch64/neon.rs index 60929f8fe9..30adbf309b 100644 --- a/coresimd/aarch64/neon.rs +++ b/coresimd/aarch64/neon.rs @@ -5,7 +5,7 @@ #[cfg(test)] use stdsimd_test::assert_instr; use coresimd::simd_llvm::simd_add; -use coresimd::v128::f64x2; +use coresimd::simd::*; /// Vector add. #[inline] diff --git a/coresimd/arm/neon.rs b/coresimd/arm/neon.rs index 4f583728f5..70f39fc391 100644 --- a/coresimd/arm/neon.rs +++ b/coresimd/arm/neon.rs @@ -4,8 +4,8 @@ use stdsimd_test::assert_instr; use coresimd::simd_llvm::simd_add; -use coresimd::v64::*; -use coresimd::v128::*; +use coresimd::simd::*; +use convert::From; /// Vector add. #[inline] @@ -140,8 +140,8 @@ pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { - let a = a.as_i16x8(); - let b = b.as_i16x8(); + let a = i16x8::from(a); + let b = i16x8::from(b); simd_add(a, b) } @@ -150,8 +150,8 @@ pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { - let a = a.as_i32x4(); - let b = b.as_i32x4(); + let a = i32x4::from(a); + let b = i32x4::from(b); simd_add(a, b) } @@ -160,8 +160,8 @@ pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { - let a = a.as_i64x2(); - let b = b.as_i64x2(); + let a = i64x2::from(a); + let b = i64x2::from(b); simd_add(a, b) } @@ -170,8 +170,8 @@ pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { - let a = a.as_u16x8(); - let b = b.as_u16x8(); + let a = u16x8::from(a); + let b = u16x8::from(b); simd_add(a, b) } @@ -180,8 +180,8 @@ pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { - let a = a.as_u32x4(); - let b = b.as_u32x4(); + let a = u32x4::from(a); + let b = u32x4::from(b); simd_add(a, b) } @@ -190,8 +190,8 @@ pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u32(a: u32x2, b: u32x2) -> u64x2 { - let a = a.as_u64x2(); - let b = b.as_u64x2(); + let a = u64x2::from(a); + let b = u64x2::from(b); simd_add(a, b) } diff --git a/coresimd/macros.rs b/coresimd/macros.rs deleted file mode 100644 index bf1bba42a9..0000000000 --- a/coresimd/macros.rs +++ /dev/null @@ -1,574 +0,0 @@ -//! Utility macros - -macro_rules! define_ty { - ($name:ident, $($elty:ident),+) => { - #[repr(simd)] - #[derive(Clone, Copy, Debug, PartialEq)] - #[allow(non_camel_case_types)] - pub struct $name($($elty),*); - } -} - -macro_rules! 
define_ty_doc { - ($name:ident, $($elty:ident),+ | $(#[$doc:meta])*) => { - $(#[$doc])* - #[repr(simd)] - #[derive(Clone, Copy, Debug, PartialEq)] - #[allow(non_camel_case_types)] - pub struct $name($($elty),*); - } -} - -macro_rules! define_impl { - ( - $name:ident, $elemty:ident, $nelems:expr, $boolname:ident, - $($elname:ident),+ - ) => { - impl $name { - #[inline(always)] - pub const fn new($($elname: $elemty),*) -> $name { - $name($($elname),*) - } - - #[inline(always)] - pub fn len() -> i32 { - $nelems - } - - #[inline(always)] - pub const fn splat(value: $elemty) -> $name { - $name($({ - #[allow(non_camel_case_types, dead_code)] - struct $elname; - value - }),*) - } - - #[inline(always)] - pub fn extract(self, idx: u32) -> $elemty { - assert!(idx < $nelems); - unsafe { self.extract_unchecked(idx) } - } - - #[inline(always)] - pub unsafe fn extract_unchecked(self, idx: u32) -> $elemty { - simd_extract(self, idx) - } - - #[inline(always)] - pub fn replace(self, idx: u32, val: $elemty) -> $name { - assert!(idx < $nelems); - unsafe { self.replace_unchecked(idx, val) } - } - - #[inline(always)] - pub unsafe fn replace_unchecked( - self, - idx: u32, - val: $elemty, - ) -> $name { - simd_insert(self, idx, val) - } - - #[inline(always)] - pub fn store(self, slice: &mut [$elemty], offset: usize) { - assert!(slice[offset..].len() >= $nelems); - unsafe { self.store_unchecked(slice, offset) } - } - - #[inline(always)] - pub unsafe fn store_unchecked( - self, - slice: &mut [$elemty], - offset: usize, - ) { - use mem::size_of; - use ptr; - - ptr::copy_nonoverlapping( - &self as *const $name as *const u8, - slice.get_unchecked_mut(offset) as *mut $elemty as *mut u8, - size_of::<$name>()); - } - - #[inline(always)] - pub fn load(slice: &[$elemty], offset: usize) -> $name { - assert!(slice[offset..].len() >= $nelems); - unsafe { $name::load_unchecked(slice, offset) } - } - - #[inline(always)] - pub unsafe fn load_unchecked( - slice: &[$elemty], - offset: usize, - ) -> $name { - use mem::size_of; - use ptr; - - let mut x = $name::splat(0 as $elemty); - ptr::copy_nonoverlapping( - slice.get_unchecked(offset) as *const $elemty as *const u8, - &mut x as *mut $name as *mut u8, - size_of::<$name>()); - x - } - - #[inline(always)] - pub fn eq(self, other: $name) -> $boolname { - unsafe { simd_eq(self, other) } - } - - #[inline(always)] - pub fn ne(self, other: $name) -> $boolname { - unsafe { simd_ne(self, other) } - } - - #[inline(always)] - pub fn lt(self, other: $name) -> $boolname { - unsafe { simd_lt(self, other) } - } - - #[inline(always)] - pub fn le(self, other: $name) -> $boolname { - unsafe { simd_le(self, other) } - } - - #[inline(always)] - pub fn gt(self, other: $name) -> $boolname { - unsafe { simd_gt(self, other) } - } - - #[inline(always)] - pub fn ge(self, other: $name) -> $boolname { - unsafe { simd_ge(self, other) } - } - } - } -} - -macro_rules! define_from { - ($to:ident, $($from:ident),+) => { - $( - impl From<$from> for $to { - #[inline(always)] - fn from(f: $from) -> $to { - unsafe { ::mem::transmute(f) } - } - } - )+ - } -} - -macro_rules! 
define_common_ops { - ($($ty:ident),+) => { - $( - impl ::ops::Add for $ty { - type Output = Self; - #[inline(always)] - fn add(self, other: Self) -> Self { - unsafe { simd_add(self, other) } - } - } - - impl ::ops::Sub for $ty { - type Output = Self; - #[inline(always)] - fn sub(self, other: Self) -> Self { - unsafe { simd_sub(self, other) } - } - } - - impl ::ops::Mul for $ty { - type Output = Self; - #[inline(always)] - fn mul(self, other: Self) -> Self { - unsafe { simd_mul(self, other) } - } - } - - impl ::ops::Div for $ty { - type Output = Self; - #[inline(always)] - fn div(self, other: Self) -> Self { - unsafe { simd_div(self, other) } - } - } - - impl ::ops::Rem for $ty { - type Output = Self; - #[inline(always)] - fn rem(self, other: Self) -> Self { - unsafe { simd_rem(self, other) } - } - } - - impl ::ops::AddAssign for $ty { - #[inline(always)] - fn add_assign(&mut self, other: Self) { - *self = *self + other; - } - } - - impl ::ops::SubAssign for $ty { - #[inline(always)] - fn sub_assign(&mut self, other: Self) { - *self = *self - other; - } - } - - impl ::ops::MulAssign for $ty { - #[inline(always)] - fn mul_assign(&mut self, other: Self) { - *self = *self * other; - } - } - - impl ::ops::DivAssign for $ty { - #[inline(always)] - fn div_assign(&mut self, other: Self) { - *self = *self / other; - } - } - - impl ::ops::RemAssign for $ty { - #[inline(always)] - fn rem_assign(&mut self, other: Self) { - *self = *self % other; - } - } - - )+ - } -} - -macro_rules! define_shifts { - ($ty:ident, $elem:ident, $($by:ident),+) => { - $( - impl ::ops::Shl<$by> for $ty { - type Output = Self; - #[inline(always)] - fn shl(self, other: $by) -> Self { - unsafe { simd_shl(self, $ty::splat(other as $elem)) } - } - } - impl ::ops::Shr<$by> for $ty { - type Output = Self; - #[inline(always)] - fn shr(self, other: $by) -> Self { - unsafe { simd_shr(self, $ty::splat(other as $elem)) } - } - } - - impl ::ops::ShlAssign<$by> for $ty { - #[inline(always)] - fn shl_assign(&mut self, other: $by) { - *self = *self << other; - } - } - impl ::ops::ShrAssign<$by> for $ty { - #[inline(always)] - fn shr_assign(&mut self, other: $by) { - *self = *self >> other; - } - } - - )+ - } -} - -macro_rules! define_float_ops { - ($($ty:ident),+) => { - $( - impl ::ops::Neg for $ty { - type Output = Self; - #[inline(always)] - fn neg(self) -> Self { - Self::splat(-1.0) * self - } - } - )+ - }; -} - -macro_rules! define_signed_integer_ops { - ($($ty:ident),+) => { - $( - impl ::ops::Neg for $ty { - type Output = Self; - #[inline(always)] - fn neg(self) -> Self { - Self::splat(-1) * self - } - } - )+ - }; -} - -macro_rules! 
define_integer_ops { - ($(($ty:ident, $elem:ident)),+) => { - $( - impl ::ops::Not for $ty { - type Output = Self; - #[inline(always)] - fn not(self) -> Self { - $ty::splat(!0) ^ self - } - } - - impl ::ops::BitAnd for $ty { - type Output = Self; - #[inline(always)] - fn bitand(self, other: Self) -> Self { - unsafe { simd_and(self, other) } - } - } - impl ::ops::BitOr for $ty { - type Output = Self; - #[inline(always)] - fn bitor(self, other: Self) -> Self { - unsafe { simd_or(self, other) } - } - } - impl ::ops::BitXor for $ty { - type Output = Self; - #[inline(always)] - fn bitxor(self, other: Self) -> Self { - unsafe { simd_xor(self, other) } - } - } - impl ::ops::BitAndAssign for $ty { - #[inline(always)] - fn bitand_assign(&mut self, other: Self) { - *self = *self & other; - } - } - impl ::ops::BitOrAssign for $ty { - #[inline(always)] - fn bitor_assign(&mut self, other: Self) { - *self = *self | other; - } - } - impl ::ops::BitXorAssign for $ty { - #[inline(always)] - fn bitxor_assign(&mut self, other: Self) { - *self = *self ^ other; - } - } - - define_shifts!( - $ty, $elem, - u8, u16, u32, u64, usize, - i8, i16, i32, i64, isize); - - impl ::fmt::LowerHex for $ty { - fn fmt(&self, f: &mut ::fmt::Formatter) - -> ::fmt::Result { - write!(f, "{}(", stringify!($ty))?; - let n = ::mem::size_of_val(self) - / ::mem::size_of::<$elem>(); - for i in 0..n { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{:#x}", self.extract(i as u32))?; - } - write!(f, ")") - } - } - )+ - } -} - -macro_rules! define_casts { - ($(($fromty:ident, $toty:ident, $cast:ident)),+) => { - $( - impl $fromty { - #[inline(always)] - pub fn $cast(self) -> ::coresimd::simd::$toty { - unsafe { simd_cast(self) } - } - } - )+ - } -} - -#[cfg(test)] -#[macro_export] -macro_rules! test_arithmetic_ { - ($tn:ident, $zero:expr, $one:expr, $two:expr, $four:expr) => { - { - let z = $tn::splat($zero); - let o = $tn::splat($one); - let t = $tn::splat($two); - let f = $tn::splat($four); - - // add - assert_eq!(z + z, z); - assert_eq!(o + z, o); - assert_eq!(t + z, t); - assert_eq!(t + t, f); - // sub - assert_eq!(z - z, z); - assert_eq!(o - z, o); - assert_eq!(t - z, t); - assert_eq!(f - t, t); - assert_eq!(f - o - o, t); - // mul - assert_eq!(z * z, z); - assert_eq!(z * o, z); - assert_eq!(z * t, z); - assert_eq!(o * t, t); - assert_eq!(t * t, f); - // div - assert_eq!(z / o, z); - assert_eq!(t / o, t); - assert_eq!(f / o, f); - assert_eq!(t / t, o); - assert_eq!(f / t, t); - // rem - assert_eq!(o % o, z); - assert_eq!(f % t, z); - - { - let mut v = z; - assert_eq!(v, z); - v += o; // add_assign - assert_eq!(v, o); - v -= o; // sub_assign - assert_eq!(v, z); - v = t; - v *= o; // mul_assign - assert_eq!(v, t); - v *= t; - assert_eq!(v, f); - v /= o; // div_assign - assert_eq!(v, f); - v /= t; - assert_eq!(v, t); - v %= t; // rem_assign - assert_eq!(v, z); - } - } - }; - } - -#[cfg(test)] -#[macro_export] -macro_rules! test_neg_ { - ($tn:ident, $zero:expr, $one:expr, $two:expr, $four:expr) => { - { - let z = $tn::splat($zero); - let o = $tn::splat($one); - let t = $tn::splat($two); - let f = $tn::splat($four); - - let nz = $tn::splat(-$zero); - let no = $tn::splat(-$one); - let nt = $tn::splat(-$two); - let nf = $tn::splat(-$four); - - assert_eq!(-z, nz); - assert_eq!(-o, no); - assert_eq!(-t, nt); - assert_eq!(-f, nf); - } - }; - } - -#[cfg(test)] -#[macro_export] -macro_rules! 
test_bit_arithmetic_ { - ($tn:ident) => { - { - let z = $tn::splat(0); - let o = $tn::splat(1); - let t = $tn::splat(2); - let f = $tn::splat(4); - let m = $tn::splat(!z.extract(0)); - - // shr - assert_eq!(o >> 1, z); - assert_eq!(t >> 1, o); - assert_eq!(f >> 1, t); - // shl - assert_eq!(o << 1, t); - assert_eq!(o << 2, f); - assert_eq!(t << 1, f); - // bitand - assert_eq!(o & o, o); - assert_eq!(t & t, t); - assert_eq!(t & o, z); - // bitor - assert_eq!(o | o, o); - assert_eq!(t | t, t); - assert_eq!(z | o, o); - // bitxor - assert_eq!(o ^ o, z); - assert_eq!(t ^ t, z); - assert_eq!(z ^ o, o); - // not - assert_eq!(!z, m); - assert_eq!(!m, z); - - { // shr_assign - let mut v = o; - v >>= 1; - assert_eq!(v, z); - } - { // shl_assign - let mut v = o; - v <<= 1; - assert_eq!(v, t); - } - { // and_assign - let mut v = o; - v &= t; - assert_eq!(v, z); - } - { // or_assign - let mut v = z; - v |= o; - assert_eq!(v, o); - } - { // xor_assign - let mut v = z; - v ^= o; - assert_eq!(v, o); - } - } - }; -} - -#[cfg(test)] -#[macro_export] -macro_rules! test_ops_si { - ($($tn:ident),+) => { - $( - test_arithmetic_!($tn, 0, 1, 2, 4); - test_neg_!($tn, 0, 1, 2, 4); - test_bit_arithmetic_!($tn); - )+ - }; - } - -#[cfg(test)] -#[macro_export] -macro_rules! test_ops_ui { - ($($tn:ident),+) => { - $( - test_arithmetic_!($tn, 0, 1, 2, 4); - test_bit_arithmetic_!($tn); - )+ - }; - } - -#[cfg(test)] -#[macro_export] -macro_rules! test_ops_f { - ($($tn:ident),+) => { - $( - test_arithmetic_!($tn, 0., 1., 2., 4.); - test_neg_!($tn, 0., 1., 2., 4.); - )+ - }; - } diff --git a/coresimd/mod.rs b/coresimd/mod.rs index 2204bce4c9..38cbe92cf9 100644 --- a/coresimd/mod.rs +++ b/coresimd/mod.rs @@ -1,14 +1,16 @@ +//! `coresimd` + +#[macro_use] +mod ppsv; + /// Platform independent SIMD vector types and operations. /// -/// This is an **unstable** module for portable SIMD operations. This module has -/// not yet gone through an RFC and is likely to change, but feedback is always -/// welcome! +/// This is an **unstable** module for portable SIMD operations. This module +/// has not yet gone through an RFC and is likely to change, but feedback is +/// always welcome! #[unstable(feature = "stdsimd", issue = "0")] pub mod simd { - pub use coresimd::v128::*; - pub use coresimd::v256::*; - pub use coresimd::v512::*; - pub use coresimd::v64::*; + pub use coresimd::ppsv::*; } /// Platform dependent vendor intrinsics. @@ -23,15 +25,15 @@ pub mod simd { /// only one platform it actually contains intrinsics for multiple platforms /// compiled in conditionally. 
For other platforms of stdsimd see: /// -/// * [x86] -/// * [x86_64] -/// * [arm] -/// * [aarch64] +/// * [`x86`] +/// * [`x86_64`] +/// * [`arm`] +/// * [`aarch64`] /// -/// [x86]: https://rust-lang-nursery.github.io/stdsimd/x86/stdsimd/arch/index.html -/// [x86_64]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/index.html -/// [arm]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/index.html -/// [aarch64]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/index.html +/// [`x86`]: https://rust-lang-nursery.github.io/stdsimd/x86/stdsimd/arch/index.html +/// [`x86_64`]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/index.html +/// [`arm`]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/index.html +/// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/index.html #[unstable(feature = "stdsimd", issue = "0")] pub mod arch { /// Platform-specific intrinsics for the `x86` platform. @@ -69,51 +71,7 @@ pub mod arch { } } -#[macro_use] -mod macros; mod simd_llvm; -mod v128; -mod v256; -mod v512; -mod v64; - -/// 32-bit wide vector tpyes -mod v32 { - #[cfg(not(test))] - use prelude::v1::*; - use coresimd::simd_llvm::*; - - define_ty! { i16x2, i16, i16 } - define_impl! { i16x2, i16, 2, i16x2, x0, x1 } - define_ty! { u16x2, u16, u16 } - define_impl! { u16x2, u16, 2, i16x2, x0, x1 } - - define_ty! { i8x4, i8, i8, i8, i8 } - define_impl! { i8x4, i8, 4, i8x4, x0, x1, x2, x3 } - define_ty! { u8x4, u8, u8, u8, u8 } - define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 } - - define_casts!( - (i16x2, i64x2, as_i64x2), - (u16x2, i64x2, as_i64x2), - (i8x4, i32x4, as_i32x4), - (u8x4, i32x4, as_i32x4) - ); -} - -/// 16-bit wide vector tpyes -mod v16 { - #[cfg(not(test))] - use prelude::v1::*; - use coresimd::simd_llvm::*; - - define_ty! { i8x2, i8, i8 } - define_impl! { i8x2, i8, 2, i8x2, x0, x1 } - define_ty! { u8x2, u8, u8 } - define_impl! { u8x2, u8, 2, i8x2, x0, x1 } - - define_casts!((i8x2, i64x2, as_i64x2), (u8x2, i64x2, as_i64x2)); -} #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod x86; diff --git a/coresimd/ppsv/api/arithmetic_ops.rs b/coresimd/ppsv/api/arithmetic_ops.rs new file mode 100644 index 0000000000..e98745014b --- /dev/null +++ b/coresimd/ppsv/api/arithmetic_ops.rs @@ -0,0 +1,142 @@ +//! Lane-wise arithmetic operations. + +macro_rules! 
impl_arithmetic_ops { + ($id:ident) => { + impl ops::Add for $id { + type Output = Self; + #[inline] + fn add(self, other: Self) -> Self { + unsafe { simd_add(self, other) } + } + } + + impl ops::Sub for $id { + type Output = Self; + #[inline] + fn sub(self, other: Self) -> Self { + unsafe { simd_sub(self, other) } + } + } + + impl ops::Mul for $id { + type Output = Self; + #[inline] + fn mul(self, other: Self) -> Self { + unsafe { simd_mul(self, other) } + } + } + + impl ops::Div for $id { + type Output = Self; + #[inline] + fn div(self, other: Self) -> Self { + unsafe { simd_div(self, other) } + } + } + + impl ops::Rem for $id { + type Output = Self; + #[inline] + fn rem(self, other: Self) -> Self { + unsafe { simd_rem(self, other) } + } + } + + impl ops::AddAssign for $id { + #[inline] + fn add_assign(&mut self, other: Self) { + *self = *self + other; + } + } + + impl ops::SubAssign for $id { + #[inline] + fn sub_assign(&mut self, other: Self) { + *self = *self - other; + } + } + + impl ops::MulAssign for $id { + #[inline] + fn mul_assign(&mut self, other: Self) { + *self = *self * other; + } + } + + impl ops::DivAssign for $id { + #[inline] + fn div_assign(&mut self, other: Self) { + *self = *self / other; + } + } + + impl ops::RemAssign for $id { + #[inline] + fn rem_assign(&mut self, other: Self) { + *self = *self % other; + } + } + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_arithmetic_ops { + ($id:ident, $elem_ty:ident) => { + #[test] + fn arithmetic() { + use ::coresimd::simd::$id; + let z = $id::splat(0 as $elem_ty); + let o = $id::splat(1 as $elem_ty); + let t = $id::splat(2 as $elem_ty); + let f = $id::splat(4 as $elem_ty); + + // add + assert_eq!(z + z, z); + assert_eq!(o + z, o); + assert_eq!(t + z, t); + assert_eq!(t + t, f); + // sub + assert_eq!(z - z, z); + assert_eq!(o - z, o); + assert_eq!(t - z, t); + assert_eq!(f - t, t); + assert_eq!(f - o - o, t); + // mul + assert_eq!(z * z, z); + assert_eq!(z * o, z); + assert_eq!(z * t, z); + assert_eq!(o * t, t); + assert_eq!(t * t, f); + // div + assert_eq!(z / o, z); + assert_eq!(t / o, t); + assert_eq!(f / o, f); + assert_eq!(t / t, o); + assert_eq!(f / t, t); + // rem + assert_eq!(o % o, z); + assert_eq!(f % t, z); + + { + let mut v = z; + assert_eq!(v, z); + v += o; // add_assign + assert_eq!(v, o); + v -= o; // sub_assign + assert_eq!(v, z); + v = t; + v *= o; // mul_assign + assert_eq!(v, t); + v *= t; + assert_eq!(v, f); + v /= o; // div_assign + assert_eq!(v, f); + v /= t; + assert_eq!(v, t); + v %= t; // rem_assign + assert_eq!(v, z); + } + } + }; +} diff --git a/coresimd/ppsv/api/arithmetic_reductions.rs b/coresimd/ppsv/api/arithmetic_reductions.rs new file mode 100644 index 0000000000..54fba4871d --- /dev/null +++ b/coresimd/ppsv/api/arithmetic_reductions.rs @@ -0,0 +1,64 @@ +//! Implements portable arithmetic vector reductions. + +macro_rules! impl_arithmetic_reductions { + ($id:ident, $elem_ty:ident) => { + impl $id { + /// Lane-wise addition of the vector elements. + #[inline] + pub fn sum(self) -> $elem_ty { + ReduceAdd::reduce_add(self) + } + /// Lane-wise multiplication of the vector elements. + #[inline] + pub fn product(self) -> $elem_ty { + ReduceMul::reduce_mul(self) + } + } + } +} + +#[cfg(test)] +macro_rules! 
test_arithmetic_reductions { + ($id:ident, $elem_ty:ident) => { + + fn alternating(x: usize) -> ::coresimd::simd::$id { + use ::coresimd::simd::$id; + let mut v = $id::splat(1 as $elem_ty); + for i in 0..$id::lanes() { + if i % x == 0 { + v = v.replace(i, 2 as $elem_ty); + } + } + v + } + + #[test] + fn sum() { + use ::coresimd::simd::$id; + let v = $id::splat(0 as $elem_ty); + assert_eq!(v.sum(), 0 as $elem_ty); + let v = $id::splat(1 as $elem_ty); + assert_eq!(v.sum(), $id::lanes() as $elem_ty); + let v = alternating(2); + eprintln!("{:?}", v); + assert_eq!(v.sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty); + } + #[test] + fn product() { + use ::coresimd::simd::$id; + let v = $id::splat(0 as $elem_ty); + assert_eq!(v.product(), 0 as $elem_ty); + let v = $id::splat(1 as $elem_ty); + assert_eq!(v.product(), 1 as $elem_ty); + let f = match $id::lanes() { + 64 => 16, + 32 => 8, + 16 => 4, + _ => 2, + }; + let v = alternating(f); + eprintln!("{:?}", v); + assert_eq!(v.product(), (2_usize.pow(($id::lanes() / f) as u32) as $elem_ty)); + } + } +} diff --git a/coresimd/ppsv/api/bitwise_ops.rs b/coresimd/ppsv/api/bitwise_ops.rs new file mode 100644 index 0000000000..aa82b2e797 --- /dev/null +++ b/coresimd/ppsv/api/bitwise_ops.rs @@ -0,0 +1,171 @@ +//! Lane-wise bitwise operations for integer and boolean vectors. + +macro_rules! impl_bitwise_ops { + ($ty:ident, $true_val:expr) => { + impl ops::Not for $ty { + type Output = Self; + #[inline] + fn not(self) -> Self { + Self::splat($true_val) ^ self + } + } + impl ops::BitXor for $ty { + type Output = Self; + #[inline] + fn bitxor(self, other: Self) -> Self { + unsafe { simd_xor(self, other) } + } + } + impl ops::BitAnd for $ty { + type Output = Self; + #[inline] + fn bitand(self, other: Self) -> Self { + unsafe { simd_and(self, other) } + } + } + impl ops::BitOr for $ty { + type Output = Self; + #[inline] + fn bitor(self, other: Self) -> Self { + unsafe { simd_or(self, other) } + } + } + impl ops::BitAndAssign for $ty { + #[inline] + fn bitand_assign(&mut self, other: Self) { + *self = *self & other; + } + } + impl ops::BitOrAssign for $ty { + #[inline] + fn bitor_assign(&mut self, other: Self) { + *self = *self | other; + } + } + impl ops::BitXorAssign for $ty { + #[inline] + fn bitxor_assign(&mut self, other: Self) { + *self = *self ^ other; + } + } + }; +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_int_bitwise_ops { + ($id:ident, $elem_ty:ident) => { + #[test] + fn bitwise_ops() { + use ::coresimd::simd::$id; + let z = $id::splat(0 as $elem_ty); + let o = $id::splat(1 as $elem_ty); + let t = $id::splat(2 as $elem_ty); + let m = $id::splat(!z.extract(0)); + + // Not: + assert_eq!(!z, m); + assert_eq!(!m, z); + + // BitAnd: + assert_eq!(o & o, o); + assert_eq!(o & z, z); + assert_eq!(z & o, z); + assert_eq!(z & z, z); + + assert_eq!(t & t, t); + assert_eq!(t & o, z); + assert_eq!(o & t, z); + + // BitOr: + assert_eq!(o | o, o); + assert_eq!(o | z, o); + assert_eq!(z | o, o); + assert_eq!(z | z, z); + + assert_eq!(t | t, t); + assert_eq!(z | t, t); + assert_eq!(t | z, t); + + // BitXOR: + assert_eq!(o ^ o, z); + assert_eq!(z ^ z, z); + assert_eq!(z ^ o, o); + assert_eq!(o ^ z, o); + + assert_eq!(t ^ t, z); + assert_eq!(t ^ z, t); + assert_eq!(z ^ t, t); + + { // AndAssign: + let mut v = o; + v &= t; + assert_eq!(v, z); + } + { // OrAssign: + let mut v = z; + v |= o; + assert_eq!(v, o); + } + { // XORAssign: + let mut v = z; + v ^= o; + assert_eq!(v, o); + } + } + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! 
test_bool_bitwise_ops { + ($id:ident) => { + #[test] + fn bool_arithmetic() { + use ::coresimd::simd::*; + + let t = $id::splat(true); + let f = $id::splat(false); + assert!(t != f); + assert!(!(t == f)); + + // Not: + assert_eq!(!t, f); + assert_eq!(t, !f); + + // BitAnd: + assert_eq!(t & f, f); + assert_eq!(f & t, f); + assert_eq!(t & t, t); + assert_eq!(f & f, f); + + // BitOr: + assert_eq!(t | f, t); + assert_eq!(f | t, t); + assert_eq!(t | t, t); + assert_eq!(f | f, f); + + // BitXOR: + assert_eq!(t ^ f, t); + assert_eq!(f ^ t, t); + assert_eq!(t ^ t, f); + assert_eq!(f ^ f, f); + + { // AndAssign: + let mut v = f; + v &= t; + assert_eq!(v, f); + } + { // OrAssign: + let mut v = f; + v |= t; + assert_eq!(v, t); + } + { // XORAssign: + let mut v = f; + v ^= t; + assert_eq!(v, t); + } + } + } +} diff --git a/coresimd/ppsv/api/bitwise_reductions.rs b/coresimd/ppsv/api/bitwise_reductions.rs new file mode 100644 index 0000000000..ae598e5b98 --- /dev/null +++ b/coresimd/ppsv/api/bitwise_reductions.rs @@ -0,0 +1,97 @@ +//! Implements portable bitwise vector reductions. + +macro_rules! impl_bitwise_reductions { + ($id:ident, $elem_ty:ident) => { + impl $id { + /// Lane-wise bitwise `and` of the vector elements. + #[inline] + pub fn and(self) -> $elem_ty { + ReduceAnd::reduce_and(self) + } + /// Lane-wise bitwise `or` of the vector elements. + #[inline] + pub fn or(self) -> $elem_ty { + ReduceOr::reduce_or(self) + } + /// Lane-wise bitwise `xor` of the vector elements. + #[inline] + pub fn xor(self) -> $elem_ty { + ReduceXor::reduce_xor(self) + } + } + } +} + +macro_rules! impl_bool_bitwise_reductions { + ($id:ident, $elem_ty:ident) => { + impl $id { + /// Lane-wise bitwise `and` of the vector elements. + #[inline] + pub fn and(self) -> $elem_ty { + ReduceAnd::reduce_and(self) !=0 + } + /// Lane-wise bitwise `or` of the vector elements. + #[inline] + pub fn or(self) -> $elem_ty { + ReduceOr::reduce_or(self) != 0 + } + /// Lane-wise bitwise `xor` of the vector elements. + #[inline] + pub fn xor(self) -> $elem_ty { + ReduceXor::reduce_xor(self) != 0 + } + } + } +} + + +#[cfg(test)] +macro_rules! test_bitwise_reductions { + ($id:ident, $true:expr) => { + #[test] + fn and() { + let false_ = !$true; + use ::coresimd::simd::$id; + let v = $id::splat(false_); + assert_eq!(v.and(), false_); + let v = $id::splat($true); + assert_eq!(v.and(), $true); + let v = $id::splat(false_); + let v = v.replace(0, $true); + assert_eq!(v.and(), false_); + let v = $id::splat($true); + let v = v.replace(0, false_); + assert_eq!(v.and(), false_); + } + #[test] + fn or() { + let false_ = !$true; + use ::coresimd::simd::$id; + let v = $id::splat(false_); + assert_eq!(v.or(), false_); + let v = $id::splat($true); + assert_eq!(v.or(), $true); + let v = $id::splat(false_); + let v = v.replace(0, $true); + assert_eq!(v.or(), $true); + let v = $id::splat($true); + let v = v.replace(0, false_); + assert_eq!(v.or(), $true); + } + #[test] + fn xor() { + let false_ = !$true; + use ::coresimd::simd::$id; + let v = $id::splat(false_); + assert_eq!(v.xor(), false_); + let v = $id::splat($true); + assert_eq!(v.xor(), false_); + let v = $id::splat(false_); + let v = v.replace(0, $true); + assert_eq!(v.xor(), $true); + let v = $id::splat($true); + let v = v.replace(0, false_); + assert_eq!(v.xor(), $true); + } + } +} diff --git a/coresimd/ppsv/api/bool_vectors.rs b/coresimd/ppsv/api/bool_vectors.rs new file mode 100644 index 0000000000..01fa13f0b8 --- /dev/null +++ b/coresimd/ppsv/api/bool_vectors.rs @@ -0,0 +1,147 @@ +//! 
Minimal boolean vector implementation + +/// Minimal interface: all packed SIMD boolean vector types implement this. +macro_rules! impl_bool_minimal { + ($id:ident, $elem_ty:ident, $elem_count:expr, $($elem_name:ident),+) => { + + #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] + impl Clone for $id { + #[inline] // currently needed for correctness + fn clone(&self) -> Self { + *self + } + } + + impl $id { + /// Creates a new instance with each vector elements initialized + /// with the provided values. + #[inline] + pub const fn new($($elem_name: bool),*) -> Self { + $id($(Self::bool_to_internal($elem_name)),*) + } + + /// Converts a boolean type into the type of the vector lanes. + #[inline] + const fn bool_to_internal(x: bool) -> $elem_ty { + [0 as $elem_ty, !(0 as $elem_ty)][x as usize] + } + + /// Returns the number of vector lanes. + #[inline] + pub const fn lanes() -> usize { + $elem_count + } + + /// Constructs a new instance with each element initialized to + /// `value`. + #[inline] + pub const fn splat(value: bool) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + Self::bool_to_internal(value) + }),*) + } + + /// Extracts the value at `index`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + pub fn extract(self, index: usize) -> bool { + assert!(index < $elem_count); + unsafe { self.extract_unchecked(index) } + } + + /// Extracts the value at `index`. + /// + /// If `index >= Self::lanes()` the behavior is undefined. + #[inline] + pub unsafe fn extract_unchecked(self, index: usize) -> bool { + let x: $elem_ty = simd_extract(self, index as u32); + x != 0 + } + + /// Returns a new vector where the value at `index` is replaced by `new_value`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + #[must_use = "replace does not modify the original value - it returns a new vector with the value at `index` replaced by `new_value`d"] + pub fn replace(self, index: usize, new_value: bool) -> Self { + assert!(index < $elem_count); + unsafe { self.replace_unchecked(index, new_value) } + } + + /// Returns a new vector where the value at `index` is replaced by `new_value`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + #[must_use = "replace_unchecked does not modify the original value - it returns a new vector with the value at `index` replaced by `new_value`d"] + pub unsafe fn replace_unchecked( + self, + index: usize, + new_value: bool, + ) -> Self { + simd_insert(self, index as u32, Self::bool_to_internal(new_value)) + } + } + } +} + +#[cfg(test)] +macro_rules! 
test_bool_minimal { + ($id:ident, $elem_count:expr) => { + #[test] + fn minimal() { + use ::coresimd::simd::$id; + // TODO: test new + + // lanes: + assert_eq!($elem_count, $id::lanes()); + + // splat and extract / extract_unchecked: + let vec = $id::splat(true); + for i in 0..$id::lanes() { + assert_eq!(true, vec.extract(i)); + assert_eq!(true, unsafe { vec.extract_unchecked(i) }); + } + + // replace / replace_unchecked + let new_vec = vec.replace(1, false); + for i in 0..$id::lanes() { + if i == 1 { + assert_eq!(false, new_vec.extract(i)); + } else { + assert_eq!(true, new_vec.extract(i)); + } + } + let new_vec = unsafe { vec.replace_unchecked(1, false) }; + for i in 0..$id::lanes() { + if i == 1 { + assert_eq!(false, new_vec.extract(i)); + } else { + assert_eq!(true, new_vec.extract(i)); + } + } + } + #[test] + #[should_panic] + fn minimal_extract_panic_on_out_of_bounds() { + use ::coresimd::simd::$id; + let vec = $id::splat(false); + let _ = vec.extract($id::lanes()); + } + #[test] + #[should_panic] + fn minimal_replace_panic_on_out_of_bounds() { + use ::coresimd::simd::$id; + let vec = $id::splat(false); + let _ = vec.replace($id::lanes(), true); + } + } +} diff --git a/coresimd/ppsv/api/boolean_reductions.rs b/coresimd/ppsv/api/boolean_reductions.rs new file mode 100644 index 0000000000..3c45fee48d --- /dev/null +++ b/coresimd/ppsv/api/boolean_reductions.rs @@ -0,0 +1,83 @@ +//! Lane-wise boolean vector reductions. + +macro_rules! impl_bool_reductions { + ($id:ident) => { + impl $id { + /// Are `all` vector lanes `true`? + #[inline] + pub fn all(self) -> bool { + self.and() + } + /// Is `any` vector lanes `true`? + #[inline] + pub fn any(self) -> bool { + self.or() + } + /// Are `all` vector lanes `false`? + #[inline] + pub fn none(self) -> bool { + !self.or() + } + } + } +} + +#[cfg(test)] +macro_rules! test_bool_reductions { + ($id:ident) => { + #[test] + fn all() { + use ::coresimd::simd::$id; + + let a = $id::splat(true); + assert!(a.all()); + let a = $id::splat(false); + assert!(!a.all()); + + for i in 0..$id::lanes() { + let mut a = $id::splat(true); + a = a.replace(i, false); + assert!(!a.all()); + let mut a = $id::splat(false); + a = a.replace(i, true); + assert!(!a.all()); + } + } + #[test] + fn any() { + use ::coresimd::simd::$id; + + let a = $id::splat(true); + assert!(a.any()); + let a = $id::splat(false); + assert!(!a.any()); + + for i in 0..$id::lanes() { + let mut a = $id::splat(true); + a = a.replace(i, false); + assert!(a.any()); + let mut a = $id::splat(false); + a = a.replace(i, true); + assert!(a.any()); + } + } + #[test] + fn none() { + use ::coresimd::simd::$id; + + let a = $id::splat(true); + assert!(!a.none()); + let a = $id::splat(false); + assert!(a.none()); + + for i in 0..$id::lanes() { + let mut a = $id::splat(true); + a = a.replace(i, false); + assert!(!a.none()); + let mut a = $id::splat(false); + a = a.replace(i, true); + assert!(!a.none()); + } + } + } +} diff --git a/coresimd/ppsv/api/cmp.rs b/coresimd/ppsv/api/cmp.rs new file mode 100644 index 0000000000..5eb15d2933 --- /dev/null +++ b/coresimd/ppsv/api/cmp.rs @@ -0,0 +1,138 @@ +//! Lane-wise vector comparisons returning boolean vectors. + +macro_rules! impl_cmp { + ($id:ident, $bool_ty:ident) => { + impl $id { + /// Lane-wise equality comparison. + #[inline] + pub fn eq(self, other: $id) -> $bool_ty { + unsafe { simd_eq(self, other) } + } + + /// Lane-wise inequality comparison. 
+ #[inline] + pub fn ne(self, other: $id) -> $bool_ty { + unsafe { simd_ne(self, other) } + } + + /// Lane-wise less-than comparison. + #[inline] + pub fn lt(self, other: $id) -> $bool_ty { + unsafe { simd_lt(self, other) } + } + + /// Lane-wise less-than-or-equals comparison. + #[inline] + pub fn le(self, other: $id) -> $bool_ty { + unsafe { simd_le(self, other) } + } + + /// Lane-wise greater-than comparison. + #[inline] + pub fn gt(self, other: $id) -> $bool_ty { + unsafe { simd_gt(self, other) } + } + + /// Lane-wise greater-than-or-equals comparison. + #[inline] + pub fn ge(self, other: $id) -> $bool_ty { + unsafe { simd_ge(self, other) } + } + } + } +} + +macro_rules! impl_bool_cmp { + ($id:ident, $bool_ty:ident) => { + impl $id { + /// Lane-wise equality comparison. + #[inline] + pub fn eq(self, other: $id) -> $bool_ty { + unsafe { simd_eq(self, other) } + } + + /// Lane-wise inequality comparison. + #[inline] + pub fn ne(self, other: $id) -> $bool_ty { + unsafe { simd_ne(self, other) } + } + + /// Lane-wise less-than comparison. + #[inline] + pub fn lt(self, other: $id) -> $bool_ty { + unsafe { simd_gt(self, other) } + } + + /// Lane-wise less-than-or-equals comparison. + #[inline] + pub fn le(self, other: $id) -> $bool_ty { + unsafe { simd_ge(self, other) } + } + + /// Lane-wise greater-than comparison. + #[inline] + pub fn gt(self, other: $id) -> $bool_ty { + unsafe { simd_lt(self, other) } + } + + /// Lane-wise greater-than-or-equals comparison. + #[inline] + pub fn ge(self, other: $id) -> $bool_ty { + unsafe { simd_le(self, other) } + } + } + } +} + + +#[cfg(test)] +#[macro_export] +macro_rules! test_cmp { + ($id:ident, $elem_ty:ident, $bool_ty:ident, + $true:expr, $false:expr) => { + #[test] + fn cmp() { + use ::coresimd::simd::*; + + let a = $id::splat($false); + let b = $id::splat($true); + + let r = a.lt(b); + let e = $bool_ty::splat(true); + eprintln!("0| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + let r = a.le(b); + eprintln!("1| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + + let e = $bool_ty::splat(false); + let r = a.gt(b); + eprintln!("2| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + let r = a.ge(b); + eprintln!("3| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + let r = a.eq(b); + eprintln!("4| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + + let mut a = a; + let mut b = b; + let mut e = e; + for i in 0..$id::lanes() { + if i % 2 == 0 { + a = a.replace(i, $false); + b = b.replace(i, $true); + e = e.replace(i, true); + } else { + a = a.replace(i, $true); + b = b.replace(i, $false); + e = e.replace(i, false); + } + } + let r = a.lt(b); + eprintln!("5| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + } + } +} diff --git a/coresimd/ppsv/api/default.rs b/coresimd/ppsv/api/default.rs new file mode 100644 index 0000000000..153bbe1ae3 --- /dev/null +++ b/coresimd/ppsv/api/default.rs @@ -0,0 +1,28 @@ +//! Implements `Default` for vector types. + +macro_rules! impl_default { + ($id:ident, $elem_ty:ident) => { + impl Default for $id { + #[inline] + fn default() -> Self { + Self::splat($elem_ty::default()) + } + } + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! 
test_default {
+    ($id:ident, $elem_ty:ident) => {
+        #[test]
+        fn default() {
+            use ::coresimd::simd::*;
+            use std::default::Default;
+            let a = $id::default();
+            for i in 0..$id::lanes() {
+                assert_eq!(a.extract(i), $elem_ty::default());
+            }
+        }
+    }
+}
diff --git a/coresimd/ppsv/api/eq.rs b/coresimd/ppsv/api/eq.rs
new file mode 100644
index 0000000000..bcbee31041
--- /dev/null
+++ b/coresimd/ppsv/api/eq.rs
@@ -0,0 +1,5 @@
+//! Implements `Eq` for vector types.
+
+macro_rules! impl_eq {
+    ($id:ident) => { impl Eq for $id {} }
+}
diff --git a/coresimd/ppsv/api/fmt.rs b/coresimd/ppsv/api/fmt.rs
new file mode 100644
index 0000000000..159a049bae
--- /dev/null
+++ b/coresimd/ppsv/api/fmt.rs
@@ -0,0 +1,53 @@
+//! Implements formatting traits.
+
+macro_rules! impl_hex_fmt {
+    ($id:ident, $elem_ty:ident) => {
+        impl fmt::LowerHex for $id {
+            fn fmt(&self, f: &mut fmt::Formatter)
+                   -> fmt::Result {
+                write!(f, "{}(", stringify!($id))?;
+                let n = mem::size_of_val(self)
+                    / mem::size_of::<$elem_ty>();
+                for i in 0..n {
+                    if i > 0 {
+                        write!(f, ", ")?;
+                    }
+                    self.extract(i).fmt(f)?;
+                }
+                write!(f, ")")
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+macro_rules! test_hex_fmt_impl {
+    ($id:ident, $elem_ty:ident, $($values:expr),+) => {
+        #[test]
+        fn hex_fmt() {
+            use ::std::prelude::v1::*;
+            use ::coresimd::simd::$id;
+            for &i in [$($values),+].iter() {
+                let vec = $id::splat(i as $elem_ty);
+
+                let s = format!("{:#x}", vec);
+                let beg = format!("{}(", stringify!($id));
+                assert!(s.starts_with(&beg));
+                assert!(s.ends_with(")"));
+                let s: Vec<String> = s.replace(&beg, "").replace(")", "").split(",")
+                    .map(|v| v.trim().to_string()).collect();
+                assert_eq!(s.len(), $id::lanes());
+                for (index, ss) in s.into_iter().enumerate() {
+                    assert_eq!(ss, format!("{:#x}", vec.extract(index)));
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+macro_rules! test_hex_fmt {
+    ($id:ident, $elem_ty:ident) => {
+        test_hex_fmt_impl!($id, $elem_ty, 0 as $elem_ty, !(0 as $elem_ty), (1 as $elem_ty));
+    }
+}
diff --git a/coresimd/ppsv/api/from.rs b/coresimd/ppsv/api/from.rs
new file mode 100644
index 0000000000..f1008c1ea6
--- /dev/null
+++ b/coresimd/ppsv/api/from.rs
@@ -0,0 +1,32 @@
+//! Implements the From trait for vector types, which performs a lane-wise
+//! cast between vector types with the same number of lanes.
+
+macro_rules! impl_from {
+    ($to:ident: $elem_ty:ident, $test_mod:ident | $($from:ident),+) => {
+        $(
+            impl From<::simd::$from> for $to {
+                #[inline]
+                fn from(f: ::simd::$from) -> $to {
+                    unsafe { simd_cast(f) }
+                }
+            }
+        )+
+
+        #[cfg(test)]
+        mod $test_mod {
+            $(
+                #[test]
+                fn $from() {
+                    use ::std::convert::{From, Into};
+                    use ::coresimd::simd::{$from, $to};
+                    use ::std::default::Default;
+                    assert_eq!($to::lanes(), $from::lanes());
+                    let a: $from = $from::default();
+                    let b_0: $to = From::from(a);
+                    let b_1: $to = a.into();
+                    assert_eq!(b_0, b_1);
+                }
+            )+
+        }
+    }
+}
diff --git a/coresimd/ppsv/api/from_bits.rs b/coresimd/ppsv/api/from_bits.rs
new file mode 100644
index 0000000000..43f82696dc
--- /dev/null
+++ b/coresimd/ppsv/api/from_bits.rs
@@ -0,0 +1,39 @@
+//! Implements the `FromBits` trait for vector types, which performs bitwise
+//! lossless transmutes between equally-sized vector types.
+
+macro_rules! impl_from_bits_ {
+    ($to:ident: $($from:ident),+) => {
+        $(
+            impl ::simd::FromBits<$from> for $to {
+                #[inline]
+                fn from_bits(f: $from) -> $to {
+                    unsafe { mem::transmute(f) }
+                }
+            }
+        )+
+    }
+}
+
+macro_rules!
impl_from_bits {
+    ($to:ident: $elem_ty:ident, $test_mod:ident | $($from:ident),+) => {
+        impl_from_bits_!($to: $($from),+);
+
+        #[cfg(test)]
+        mod $test_mod {
+            $(
+                #[test]
+                fn $from() {
+                    use ::coresimd::simd::{$from, $to, FromBits, IntoBits};
+                    use ::std::{mem, default};
+                    use default::Default;
+                    assert_eq!(mem::size_of::<$from>(), mem::size_of::<$to>());
+                    let a: $from = $from::default();
+                    let b_0: $to = FromBits::from_bits(a);
+                    let b_1: $to = a.into_bits();
+                    assert_eq!(b_0, b_1);
+                }
+            )+
+        }
+    }
+}
diff --git a/coresimd/ppsv/api/hash.rs b/coresimd/ppsv/api/hash.rs
new file mode 100644
index 0000000000..47c135b4e1
--- /dev/null
+++ b/coresimd/ppsv/api/hash.rs
@@ -0,0 +1,43 @@
+//! Implements `Hash`.
+
+macro_rules! impl_hash {
+    ($id:ident, $elem_ty:ident) => {
+        impl hash::Hash for $id {
+            #[inline]
+            fn hash<H: hash::Hasher>(&self, state: &mut H) {
+                union A {
+                    data: [$elem_ty; $id::lanes()],
+                    vec: $id
+                }
+                unsafe {
+                    A { vec: *self }.data.hash(state)
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+#[macro_export]
+macro_rules! test_hash {
+    ($id:ident, $elem_ty:ident) => {
+        #[test]
+        fn hash() {
+            use ::coresimd::simd::$id;
+            use ::std::collections::hash_map::DefaultHasher;
+            use ::std::hash::{Hash, Hasher};
+            use ::std::{mem, clone};
+            use clone::Clone;
+            type A = [$elem_ty; $id::lanes()];
+            let a: A = [42 as $elem_ty; $id::lanes()];
+            assert!(mem::size_of::<A>() == mem::size_of::<$id>());
+            let mut a_hash = DefaultHasher::new();
+            let mut v_hash = a_hash.clone();
+            a.hash(&mut a_hash);
+
+            let v = $id::splat(42 as $elem_ty);
+            v.hash(&mut v_hash);
+            assert_eq!(a_hash.finish(), v_hash.finish());
+        }
+    }
+}
diff --git a/coresimd/ppsv/api/load_store.rs b/coresimd/ppsv/api/load_store.rs
new file mode 100644
index 0000000000..fe21f74c98
--- /dev/null
+++ b/coresimd/ppsv/api/load_store.rs
@@ -0,0 +1,272 @@
+//! Implements the load/store API.
+
+macro_rules! impl_load_store {
+    ($id:ident, $elem_ty:ident, $elem_count:expr) => {
+        impl $id {
+            /// Writes the values of the vector to the `slice`.
+            ///
+            /// # Panics
+            ///
+            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not
+            /// aligned to an `align_of::<Self>()` boundary.
+            #[inline]
+            pub fn store_aligned(self, slice: &mut [$elem_ty]) {
+                unsafe {
+                    assert!(slice.len() >= $elem_count);
+                    let target_ptr = slice.get_unchecked_mut(0) as *mut $elem_ty;
+                    assert!(target_ptr.align_offset(mem::align_of::<Self>()) == 0);
+                    self.store_aligned_unchecked(slice);
+                }
+            }
+
+            /// Writes the values of the vector to the `slice`.
+            ///
+            /// # Panics
+            ///
+            /// If `slice.len() < Self::lanes()`.
+            #[inline]
+            pub fn store_unaligned(self, slice: &mut [$elem_ty]) {
+                unsafe {
+                    assert!(slice.len() >= $elem_count);
+                    self.store_unaligned_unchecked(slice);
+                }
+            }
+
+            /// Writes the values of the vector to the `slice`.
+            ///
+            /// # Precondition
+            ///
+            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not
+            /// aligned to an `align_of::<Self>()` boundary, the behavior is
+            /// undefined.
+            #[inline]
+            pub unsafe fn store_aligned_unchecked(
+                self,
+                slice: &mut [$elem_ty]
+
+            ) {
+                *(slice.get_unchecked_mut(0) as *mut $elem_ty as *mut Self) = self;
+            }
+
+            /// Writes the values of the vector to the `slice`.
+            ///
+            /// # Precondition
+            ///
+            /// If `slice.len() < Self::lanes()` the behavior is undefined.
+            #[inline]
+            pub unsafe fn store_unaligned_unchecked(
+                self,
+                slice: &mut [$elem_ty]
+            ) {
+                let target_ptr = slice.get_unchecked_mut(0) as *mut $elem_ty as *mut u8;
+                let self_ptr = &self as *const Self as *const u8;
+                ptr::copy_nonoverlapping(self_ptr, target_ptr, mem::size_of::<Self>());
+            }
+
+            /// Instantiates a new vector with the values of the `slice`.
+            ///
+            /// # Panics
+            ///
+            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned
+            /// to an `align_of::<Self>()` boundary.
+            #[inline]
+            pub fn load_aligned(slice: &[$elem_ty]) -> Self {
+                unsafe {
+                    assert!(slice.len() >= $elem_count);
+                    let target_ptr = slice.get_unchecked(0) as *const $elem_ty;
+                    assert!(target_ptr.align_offset(mem::align_of::<Self>()) == 0);
+                    Self::load_aligned_unchecked(slice)
+                }
+            }
+
+            /// Instantiates a new vector with the values of the `slice`.
+            ///
+            /// # Panics
+            ///
+            /// If `slice.len() < Self::lanes()`.
+            #[inline]
+            pub fn load_unaligned(slice: &[$elem_ty]) -> Self {
+                unsafe {
+                    assert!(slice.len() >= $elem_count);
+                    Self::load_unaligned_unchecked(slice)
+                }
+            }
+
+            /// Instantiates a new vector with the values of the `slice`.
+            ///
+            /// # Precondition
+            ///
+            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned
+            /// to an `align_of::<Self>()` boundary, the behavior is undefined.
+            #[inline]
+            pub unsafe fn load_aligned_unchecked(slice: &[$elem_ty]) -> Self {
+                *(slice.get_unchecked(0) as *const $elem_ty as *const Self)
+            }
+
+            /// Instantiates a new vector with the values of the `slice`.
+            ///
+            /// # Precondition
+            ///
+            /// If `slice.len() < Self::lanes()` the behavior is undefined.
+            #[inline]
+            pub unsafe fn load_unaligned_unchecked(slice: &[$elem_ty]) -> Self {
+                use mem::size_of;
+                let target_ptr = slice.get_unchecked(0) as *const $elem_ty as *const u8;
+                let mut x = Self::splat(0 as $elem_ty);
+                let self_ptr = &mut x as *mut Self as *mut u8;
+                ptr::copy_nonoverlapping(target_ptr, self_ptr, size_of::<Self>());
+                x
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+#[macro_export]
+macro_rules!
test_load_store { + ($id:ident, $elem_ty:ident) => { + #[test] + fn store_unaligned() { + use ::coresimd::simd::$id; + use ::std::iter::Iterator; + let mut unaligned = [0 as $elem_ty; $id::lanes() + 1]; + let vec = $id::splat(42 as $elem_ty); + vec.store_unaligned(&mut unaligned[1..]); + for (index, &b) in unaligned.iter().enumerate() { + if index == 0 { + assert_eq!(b, 0 as $elem_ty); + } else { + assert_eq!(b, vec.extract(index - 1)); + } + } + } + + #[test] + #[should_panic] + fn store_unaligned_fail() { + use ::coresimd::simd::$id; + let mut unaligned = [0 as $elem_ty; $id::lanes() + 1]; + let vec = $id::splat(42 as $elem_ty); + vec.store_unaligned(&mut unaligned[2..]); + } + + #[test] + fn load_unaligned() { + use ::coresimd::simd::$id; + use ::std::iter::Iterator; + let mut unaligned = [42 as $elem_ty; $id::lanes() + 1]; + unaligned[0] = 0 as $elem_ty; + let vec = $id::load_unaligned(&unaligned[1..]); + for (index, &b) in unaligned.iter().enumerate() { + if index == 0 { + assert_eq!(b, 0 as $elem_ty); + } else { + assert_eq!(b, vec.extract(index - 1)); + } + } + } + + #[test] + #[should_panic] + fn load_unaligned_fail() { + use ::coresimd::simd::$id; + let mut unaligned = [42 as $elem_ty; $id::lanes() + 1]; + unaligned[0] = 0 as $elem_ty; + let _vec = $id::load_unaligned(&unaligned[2..]); + } + + union A { + data: [$elem_ty; 2 * ::coresimd::simd::$id::lanes()], + vec: ::coresimd::simd::$id, + } + + #[test] + fn store_aligned() { + use ::coresimd::simd::$id; + use ::std::iter::Iterator; + let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + let vec = $id::splat(42 as $elem_ty); + unsafe { vec.store_aligned(&mut aligned.data[$id::lanes()..]) }; + for (index, &b) in unsafe { aligned.data.iter().enumerate() } { + if index < $id::lanes() { + assert_eq!(b, 0 as $elem_ty); + } else { + assert_eq!(b, vec.extract(index - $id::lanes())); + } + } + } + + #[test] + #[should_panic] + fn store_aligned_fail_lanes() { + use ::coresimd::simd::$id; + let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + let vec = $id::splat(42 as $elem_ty); + unsafe { vec.store_aligned(&mut aligned.data[2 * $id::lanes()..]) }; + } + + #[test] + #[should_panic] + fn store_aligned_fail_align() { + unsafe { + use ::coresimd::simd::$id; + use ::std::{slice, mem}; + let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + // offset the aligned data by one byte: + let s: &mut [u8; 2 * $id::lanes() * mem::size_of::<$elem_ty>()] + = mem::transmute(&mut aligned.data); + let s: &mut [$elem_ty] = slice::from_raw_parts_mut( + s.get_unchecked_mut(1) as *mut u8 as *mut $elem_ty, + $id::lanes() + ); + let vec = $id::splat(42 as $elem_ty); + vec.store_aligned(s); + } + } + + #[test] + fn load_aligned() { + use ::coresimd::simd::$id; + use ::std::iter::Iterator; + let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + for i in $id::lanes()..(2*$id::lanes()) { + unsafe { aligned.data[i] = 42 as $elem_ty; } + } + + let vec = unsafe { $id::load_aligned(&aligned.data[$id::lanes()..]) }; + for (index, &b) in unsafe { aligned.data.iter().enumerate() } { + if index < $id::lanes() { + assert_eq!(b, 0 as $elem_ty); + } else { + assert_eq!(b, vec.extract(index - $id::lanes())); + } + } + } + + #[test] + #[should_panic] + fn load_aligned_fail_lanes() { + use ::coresimd::simd::$id; + let aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + let _vec = unsafe { $id::load_aligned(&aligned.data[2 * $id::lanes()..]) }; + } + + #[test] + #[should_panic] + fn load_aligned_fail_align() { + unsafe { + 
use ::coresimd::simd::$id; + use ::std::{slice, mem}; + let aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + // offset the aligned data by one byte: + let s: &[u8; 2 * $id::lanes() * mem::size_of::<$elem_ty>()] + = mem::transmute(&aligned.data); + let s: &[$elem_ty] = slice::from_raw_parts( + s.get_unchecked(1) as *const u8 as *const $elem_ty, + $id::lanes() + ); + let _vec = $id::load_aligned(s); + } + } + } +} diff --git a/coresimd/ppsv/api/minimal.rs b/coresimd/ppsv/api/minimal.rs new file mode 100644 index 0000000000..3def265e21 --- /dev/null +++ b/coresimd/ppsv/api/minimal.rs @@ -0,0 +1,142 @@ +//! + +/// Minimal interface: all packed SIMD vector types implement this. +macro_rules! impl_minimal { + ($id:ident, $elem_ty:ident, $elem_count:expr, $($elem_name:ident),+) => { + #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] + impl Clone for $id { + #[inline] // currently needed for correctness + fn clone(&self) -> Self { + *self + } + } + + impl $id { + /// Creates a new instance with each vector elements initialized + /// with the provided values. + #[inline] + pub const fn new($($elem_name: $elem_ty),*) -> Self { + $id($($elem_name),*) + } + + /// Returns the number of vector lanes. + #[inline] + pub const fn lanes() -> usize { + $elem_count + } + + /// Constructs a new instance with each element initialized to + /// `value`. + #[inline] + pub const fn splat(value: $elem_ty) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + value + }),*) + } + + /// Extracts the value at `index`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + pub fn extract(self, index: usize) -> $elem_ty { + assert!(index < $elem_count); + unsafe { self.extract_unchecked(index) } + } + + /// Extracts the value at `index`. + /// + /// If `index >= Self::lanes()` the behavior is undefined. + #[inline] + pub unsafe fn extract_unchecked(self, index: usize) -> $elem_ty { + simd_extract(self, index as u32) + } + + /// Returns a new vector where the value at `index` is replaced by `new_value`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + #[must_use = "replace does not modify the original value - it returns a new vector with the value at `index` replaced by `new_value`d"] + pub fn replace(self, index: usize, new_value: $elem_ty) -> Self { + assert!(index < $elem_count); + unsafe { self.replace_unchecked(index, new_value) } + } + + /// Returns a new vector where the value at `index` is replaced by `new_value`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + #[must_use = "replace_unchecked does not modify the original value - it returns a new vector with the value at `index` replaced by `new_value`d"] + pub unsafe fn replace_unchecked( + self, + index: usize, + new_value: $elem_ty, + ) -> Self { + simd_insert(self, index as u32, new_value) + } + } + } +} + +#[cfg(test)] +macro_rules! 
test_minimal { + ($id:ident, $elem_ty:ident, $elem_count:expr) => { + #[test] + fn minimal() { + use ::coresimd::simd::$id; + // TODO: test new + + // lanes: + assert_eq!($elem_count, $id::lanes()); + + // splat and extract / extract_unchecked: + const VAL: $elem_ty = 7 as $elem_ty; + const VEC: $id = $id::splat(VAL); + for i in 0..$id::lanes() { + assert_eq!(VAL, VEC.extract(i)); + assert_eq!(VAL, unsafe { VEC.extract_unchecked(i) }); + } + + // replace / replace_unchecked + let new_vec = VEC.replace(1, 42 as $elem_ty); + for i in 0..$id::lanes() { + if i == 1 { + assert_eq!(42 as $elem_ty, new_vec.extract(i)); + } else { + assert_eq!(VAL, new_vec.extract(i)); + } + } + let new_vec = unsafe { VEC.replace_unchecked(1, 42 as $elem_ty) }; + for i in 0..$id::lanes() { + if i == 1 { + assert_eq!(42 as $elem_ty, new_vec.extract(i)); + } else { + assert_eq!(VAL, new_vec.extract(i)); + } + } + } + #[test] + #[should_panic] + fn minimal_extract_panic_on_out_of_bounds() { + use ::coresimd::simd::$id; + const VAL: $elem_ty = 7 as $elem_ty; + const VEC: $id = $id::splat(VAL); + let _ = VEC.extract($id::lanes()); + } + #[test] + #[should_panic] + fn minimal_replace_panic_on_out_of_bounds() { + use ::coresimd::simd::$id; + const VAL: $elem_ty = 7 as $elem_ty; + const VEC: $id = $id::splat(VAL); + let _ = VEC.replace($id::lanes(), 42 as $elem_ty); + } + } +} diff --git a/coresimd/ppsv/api/minmax_reductions.rs b/coresimd/ppsv/api/minmax_reductions.rs new file mode 100644 index 0000000000..7c158ab404 --- /dev/null +++ b/coresimd/ppsv/api/minmax_reductions.rs @@ -0,0 +1,49 @@ +//! Implements portable arithmetic vector reductions. + +macro_rules! impl_minmax_reductions { + ($id:ident, $elem_ty:ident) => { + impl $id { + /// Largest vector value. + #[inline] + pub fn max(self) -> $elem_ty { + ReduceMax::reduce_max(self) + } + /// Smallest vector value. + #[inline] + pub fn min(self) -> $elem_ty { + ReduceMin::reduce_min(self) + } + } + } +} + +#[cfg(test)] +macro_rules! test_minmax_reductions { + ($id:ident, $elem_ty:ident) => { + #[test] + fn max() { + use ::coresimd::simd::$id; + let v = $id::splat(0 as $elem_ty); + assert_eq!(v.max(), 0 as $elem_ty); + let v = v.replace(1, 1 as $elem_ty); + assert_eq!(v.max(), 1 as $elem_ty); + let v = v.replace(0, 2 as $elem_ty); + assert_eq!(v.max(), 2 as $elem_ty); + } + + #[test] + fn min() { + use ::coresimd::simd::$id; + let v = $id::splat(0 as $elem_ty); + assert_eq!(v.min(), 0 as $elem_ty); + let v = v.replace(1, 1 as $elem_ty); + assert_eq!(v.min(), 0 as $elem_ty); + let v = $id::splat(1 as $elem_ty); + let v = v.replace(0, 2 as $elem_ty); + assert_eq!(v.min(), 1 as $elem_ty); + let v = $id::splat(2 as $elem_ty); + let v = v.replace(1, 1 as $elem_ty); + assert_eq!(v.min(), 1 as $elem_ty); + } + } +} diff --git a/coresimd/ppsv/api/mod.rs b/coresimd/ppsv/api/mod.rs new file mode 100644 index 0000000000..cfd201ed26 --- /dev/null +++ b/coresimd/ppsv/api/mod.rs @@ -0,0 +1,282 @@ +//! This module defines the API of portable vector types. +//! +//! # API +//! +//! ## Traits +//! +//! All portable vector types implement the following traits: +//! +//! * [x] `Copy`, +//! * [x] `Clone`, +//! * [x] `Debug`, +//! * [x] `Default` +//! * [x] `PartialEq` +//! * [x] `PartialOrd` (TODO: re-write in term of +//! comparison operations and boolean reductions), +//! +//! Non-floating-point vector types also implement: +//! +//! * [x] `Hash`, +//! * [x] `Eq`, and +//! * [x] `Ord`. +//! +//! Integer vector types also implement: +//! +//! * [x] `fmt::LowerHex`. +//! +//! 
## Conversions +//! +//! * [x]: `FromBits/IntoBits`: bitwise lossless transmutes between vectors of +//! the same size (i.e., same `mem::size_of`). +//! * [x]: `From/Into`: casts between vectors with the same number of lanes +//! (potentially lossy). +//! +//! ## Inherent methods +//! +//! * [x] minimal API: implemented by all vector types except for boolean +//! vectors. +//! * [x] minimal boolean vector API: implemented by boolean vectors. +//! * [x] load/store API: aligned and unaligned memory loads and +//! stores - implemented by all vectors. +//! * [x] comparison API: vector lane-wise comparison producing +//! boolean vectors - implemented by all vectors. +//! * [x] arithmetic operations: implemented by all non-boolean vectors. +//! * [x] `std::ops::Neg`: implemented by signed-integer and floating-point +//! vectors. +//! * [x] bitwise operations: implemented by integer and boolean +//! vectors. +//! * [x] shift operations: implemented by integer vectors. +//! * [x] arithmetic reductions: implemented by integer and floating-point +//! vectors. +//! * [x] bitwise reductions: implemented by integer and boolean +//! vectors. +//! * [x] boolean reductions: implemented by boolean vectors. +//! * [ ] portable shuffles: `shufflevector`. +//! * [ ] portable `gather`/`scatter`: + +/// Adds the vector type `$id`, with elements of types `$elem_tys`. +macro_rules! define_ty { + ($id:ident, $($elem_tys:ident),+ | $(#[$doc:meta])*) => { + $(#[$doc])* + #[repr(simd)] + #[derive(Copy, Debug, /*FIXME:*/ PartialOrd)] + #[allow(non_camel_case_types)] + pub struct $id($($elem_tys),*); + } +} + +#[macro_use] +mod arithmetic_ops; +#[macro_use] +mod arithmetic_reductions; +#[macro_use] +mod bitwise_ops; +#[macro_use] +mod bitwise_reductions; +#[macro_use] +mod boolean_reductions; +#[macro_use] +mod bool_vectors; +#[macro_use] +mod cmp; +#[macro_use] +mod default; +#[macro_use] +mod eq; +#[macro_use] +mod fmt; +#[macro_use] +mod from; +#[macro_use] +mod from_bits; +#[macro_use] +mod hash; +#[macro_use] +mod load_store; +#[macro_use] +mod minimal; +#[macro_use] +mod minmax_reductions; +#[macro_use] +mod neg; +#[macro_use] +mod partial_eq; +// TODO: +//#[macro_use] +//mod partial_ord; +// TODO: +//#[macro_use] +//mod shuffles; +// TODO: +//#[macro_use] +//mod gather_scatter; +#[macro_use] +mod shifts; + +/// Imports required to implement vector types using the macros. + +macro_rules! simd_api_imports { + () => { + use ::coresimd::simd_llvm::*; + use fmt; + use hash; + use ops; + #[allow(unused_imports)] + use num; + use cmp::{Eq, PartialEq}; + use ptr; + use mem; + #[allow(unused_imports)] + use convert::{From, Into}; + use slice::SliceExt; + #[allow(unused_imports)] + use iter::Iterator; + #[allow(unused_imports)] + use default::Default; + use clone::Clone; + use super::codegen::sum::{ReduceAdd}; + use super::codegen::product::{ReduceMul}; + use super::codegen::and::{ReduceAnd}; + use super::codegen::or::{ReduceOr}; + use super::codegen::xor::{ReduceXor}; + use super::codegen::min::{ReduceMin}; + use super::codegen::max::{ReduceMax}; + } +} + +/// Defines a portable packed SIMD floating-point vector type. +macro_rules! 
simd_f_ty { + ($id:ident : $elem_count:expr, $elem_ty:ident, $bool_ty:ident, $test_mod:ident | + $($elem_tys:ident),+ | $($elem_name:ident),+ | $(#[$doc:meta])*) => { + define_ty!($id, $($elem_tys),+ | $(#[$doc])*); + impl_minimal!($id, $elem_ty, $elem_count, $($elem_name),*); + impl_load_store!($id, $elem_ty, $elem_count); + impl_cmp!($id, $bool_ty); + impl_arithmetic_ops!($id); + impl_arithmetic_reductions!($id, $elem_ty); + impl_minmax_reductions!($id, $elem_ty); + impl_neg_op!($id, $elem_ty); + impl_partial_eq!($id); + impl_default!($id, $elem_ty); + + #[cfg(test)] + mod $test_mod { + test_minimal!($id, $elem_ty, $elem_count); + test_load_store!($id, $elem_ty); + test_cmp!($id, $elem_ty, $bool_ty, 1. as $elem_ty, 0. as $elem_ty); + test_arithmetic_ops!($id, $elem_ty); + test_arithmetic_reductions!($id, $elem_ty); + test_minmax_reductions!($id, $elem_ty); + test_neg_op!($id, $elem_ty); + test_partial_eq!($id, 1. as $elem_ty, 0. as $elem_ty); + test_default!($id, $elem_ty); + } + } +} + +/// Defines a portable packed SIMD signed-integer vector type. +macro_rules! simd_i_ty { + ($id:ident : $elem_count:expr, $elem_ty:ident, $bool_ty:ident, $test_mod:ident | + $($elem_tys:ident),+ | $($elem_name:ident),+ | $(#[$doc:meta])*) => { + define_ty!($id, $($elem_tys),+ | $(#[$doc])*); + impl_minimal!($id, $elem_ty, $elem_count, $($elem_name),*); + impl_load_store!($id, $elem_ty, $elem_count); + impl_cmp!($id, $bool_ty); + impl_hash!($id, $elem_ty); + impl_arithmetic_ops!($id); + impl_arithmetic_reductions!($id, $elem_ty); + impl_minmax_reductions!($id, $elem_ty); + impl_neg_op!($id, $elem_ty); + impl_bitwise_ops!($id, !(0 as $elem_ty)); + impl_bitwise_reductions!($id, $elem_ty); + impl_all_shifts!($id, $elem_ty); + impl_hex_fmt!($id, $elem_ty); + impl_eq!($id); + impl_partial_eq!($id); + impl_default!($id, $elem_ty); + + #[cfg(test)] + mod $test_mod { + test_minimal!($id, $elem_ty, $elem_count); + test_load_store!($id, $elem_ty); + test_cmp!($id, $elem_ty, $bool_ty, 1 as $elem_ty, 0 as $elem_ty); + test_hash!($id, $elem_ty); + test_arithmetic_ops!($id, $elem_ty); + test_arithmetic_reductions!($id, $elem_ty); + test_minmax_reductions!($id, $elem_ty); + test_neg_op!($id, $elem_ty); + test_int_bitwise_ops!($id, $elem_ty); + test_bitwise_reductions!($id, !(0 as $elem_ty)); + test_all_shift_ops!($id, $elem_ty); + test_hex_fmt!($id, $elem_ty); + test_partial_eq!($id, 1 as $elem_ty, 0 as $elem_ty); + test_default!($id, $elem_ty); + } + } +} + +/// Defines a portable packed SIMD unsigned-integer vector type. +macro_rules! 
simd_u_ty { + ($id:ident : $elem_count:expr, $elem_ty:ident, $bool_ty:ident, $test_mod:ident | + $($elem_tys:ident),+ | $($elem_name:ident),+ | $(#[$doc:meta])*) => { + define_ty!($id, $($elem_tys),+ | $(#[$doc])*); + impl_minimal!($id, $elem_ty, $elem_count, $($elem_name),*); + impl_load_store!($id, $elem_ty, $elem_count); + impl_cmp!($id, $bool_ty); + impl_hash!($id, $elem_ty); + impl_arithmetic_ops!($id); + impl_arithmetic_reductions!($id, $elem_ty); + impl_minmax_reductions!($id, $elem_ty); + impl_bitwise_ops!($id, !(0 as $elem_ty)); + impl_bitwise_reductions!($id, $elem_ty); + impl_all_shifts!($id, $elem_ty); + impl_hex_fmt!($id, $elem_ty); + impl_eq!($id); + impl_partial_eq!($id); + impl_default!($id, $elem_ty); + + #[cfg(test)] + mod $test_mod { + test_minimal!($id, $elem_ty, $elem_count); + test_load_store!($id, $elem_ty); + test_cmp!($id, $elem_ty, $bool_ty, 1 as $elem_ty, 0 as $elem_ty); + test_hash!($id, $elem_ty); + test_arithmetic_ops!($id, $elem_ty); + test_arithmetic_reductions!($id, $elem_ty); + test_minmax_reductions!($id, $elem_ty); + test_int_bitwise_ops!($id, $elem_ty); + test_bitwise_reductions!($id, !(0 as $elem_ty)); + test_all_shift_ops!($id, $elem_ty); + test_hex_fmt!($id, $elem_ty); + test_partial_eq!($id, 1 as $elem_ty, 0 as $elem_ty); + test_default!($id, $elem_ty); + } + } +} + +/// Defines a portable packed SIMD boolean vector type. +macro_rules! simd_b_ty { + ($id:ident : $elem_count:expr, $elem_ty:ident, $test_mod:ident | + $($elem_tys:ident),+ | $($elem_name:ident),+ | $(#[$doc:meta])*) => { + define_ty!($id, $($elem_tys),+ | $(#[$doc])*); + impl_bool_minimal!($id, $elem_ty, $elem_count, $($elem_name),*); + impl_bitwise_ops!($id, true); + impl_bool_bitwise_reductions!($id, bool); + impl_bool_reductions!($id); + impl_bool_cmp!($id, $id); + impl_eq!($id); + impl_partial_eq!($id); + impl_default!($id, bool); + + #[cfg(test)] + mod $test_mod { + test_bool_minimal!($id, $elem_count); + test_bool_bitwise_ops!($id); + test_bool_reductions!($id); + test_bitwise_reductions!($id, true); + test_cmp!($id, $elem_ty, $id, true, false); + test_partial_eq!($id, true, false); + test_default!($id, bool); + } + } +} diff --git a/coresimd/ppsv/api/neg.rs b/coresimd/ppsv/api/neg.rs new file mode 100644 index 0000000000..aa1cffbf7f --- /dev/null +++ b/coresimd/ppsv/api/neg.rs @@ -0,0 +1,43 @@ +//! Implements `std::ops::Neg` for signed vector types. + +macro_rules! impl_neg_op { + ($id:ident, $elem_ty:ident) => { + impl ops::Neg for $id { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::splat(-1 as $elem_ty) * self + } + } + }; +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_neg_op { + ($id:ident, $elem_ty:ident) => { + #[test] + fn neg() { + use ::coresimd::simd::$id; + let z = $id::splat(0 as $elem_ty); + let o = $id::splat(1 as $elem_ty); + let t = $id::splat(2 as $elem_ty); + let f = $id::splat(4 as $elem_ty); + + let nz = $id::splat(-(0 as $elem_ty)); + let no = $id::splat(-(1 as $elem_ty)); + let nt = $id::splat(-(2 as $elem_ty)); + let nf = $id::splat(-(4 as $elem_ty)); + + assert_eq!(-z, nz); + assert_eq!(-o, no); + assert_eq!(-t, nt); + assert_eq!(-f, nf); + + assert_eq!(z, -nz); + assert_eq!(o, -no); + assert_eq!(t, -nt); + assert_eq!(f, -nf); + } + }; +} diff --git a/coresimd/ppsv/api/partial_eq.rs b/coresimd/ppsv/api/partial_eq.rs new file mode 100644 index 0000000000..70e7a9f966 --- /dev/null +++ b/coresimd/ppsv/api/partial_eq.rs @@ -0,0 +1,35 @@ +//! Implements `PartialEq` for vector types. + +macro_rules! 
impl_partial_eq { + ($id:ident) => { + impl PartialEq<$id> for $id { + #[inline] + fn eq(&self, other: &Self) -> bool { + $id::eq(*self, *other).all() + } + #[inline] + fn ne(&self, other: &Self) -> bool { + $id::ne(*self, *other).all() + } + } + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_partial_eq { + ($id:ident, $true:expr, $false:expr) => { + #[test] + fn partial_eq() { + use ::coresimd::simd::*; + + let a = $id::splat($false); + let b = $id::splat($true); + + assert!(a != b); + assert!(!(a == b)); + assert!(a == a); + assert!(!(a != a)); + } + } +} diff --git a/coresimd/ppsv/api/shifts.rs b/coresimd/ppsv/api/shifts.rs new file mode 100644 index 0000000000..1447447eea --- /dev/null +++ b/coresimd/ppsv/api/shifts.rs @@ -0,0 +1,122 @@ +//! Implements integer shifts. + +macro_rules! impl_shifts { + ($id:ident, $elem_ty:ident, $($by:ident),+) => { + $( + impl ::ops::Shl<$by> for $id { + type Output = Self; + #[inline] + fn shl(self, other: $by) -> Self { + unsafe { simd_shl(self, $id::splat(other as $elem_ty)) } + } + } + impl ::ops::Shr<$by> for $id { + type Output = Self; + #[inline] + fn shr(self, other: $by) -> Self { + unsafe { simd_shr(self, $id::splat(other as $elem_ty)) } + } + } + + impl ::ops::ShlAssign<$by> for $id { + #[inline] + fn shl_assign(&mut self, other: $by) { + *self = *self << other; + } + } + impl ::ops::ShrAssign<$by> for $id { + #[inline] + fn shr_assign(&mut self, other: $by) { + *self = *self >> other; + } + } + + )+ + } +} + +macro_rules! impl_all_shifts { + ($id:ident, $elem_ty:ident) => { + impl_shifts!( + $id, $elem_ty, + u8, u16, u32, u64, usize, + i8, i16, i32, i64, isize); + + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_shift_ops { + ($id:ident, $elem_ty:ident, $($index_ty:ident),+) => { + #[test] + fn shift_ops() { + use ::coresimd::simd::$id; + use ::std::mem; + let z = $id::splat(0 as $elem_ty); + let o = $id::splat(1 as $elem_ty); + let t = $id::splat(2 as $elem_ty); + let f = $id::splat(4 as $elem_ty); + + $( + { + let zi = 0 as $index_ty; + let oi = 1 as $index_ty; + let ti = 2 as $index_ty; + let maxi = (mem::size_of::<$elem_ty>() * 8 - 1) as $index_ty; + + // shr + assert_eq!(z >> zi, z); + assert_eq!(z >> oi, z); + assert_eq!(z >> ti, z); + assert_eq!(z >> ti, z); + + assert_eq!(o >> zi, o); + assert_eq!(t >> zi, t); + assert_eq!(f >> zi, f); + assert_eq!(f >> maxi, z); + + assert_eq!(o >> oi, z); + assert_eq!(t >> oi, o); + assert_eq!(t >> ti, z); + assert_eq!(f >> oi, t); + assert_eq!(f >> ti, o); + assert_eq!(f >> maxi, z); + + // shl + assert_eq!(z << zi, z); + assert_eq!(o << zi, o); + assert_eq!(t << zi, t); + assert_eq!(f << zi, f); + assert_eq!(f << maxi, z); + + assert_eq!(o << oi, t); + assert_eq!(o << ti, f); + assert_eq!(t << oi, f); + + { // shr_assign + let mut v = o; + v >>= oi; + assert_eq!(v, z); + } + { // shl_assign + let mut v = o; + v <<= oi; + assert_eq!(v, t); + } + } + )+ + } + }; +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_all_shift_ops { + ($id:ident, $elem_ty:ident) => { + test_shift_ops!( + $id, $elem_ty, + u8, u16, u32, u64, usize, + i8, i16, i32, i64, isize); + } +} diff --git a/coresimd/ppsv/codegen/and.rs b/coresimd/ppsv/codegen/and.rs new file mode 100644 index 0000000000..c149a798fd --- /dev/null +++ b/coresimd/ppsv/codegen/and.rs @@ -0,0 +1,170 @@ +//! Code generation for the and reduction. 
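Note (illustrative sketch, not part of this patch): each codegen module binds one monomorphic LLVM reduction intrinsic per vector type, and the link-name suffix spells out the scalar result type and the vector type (e.g. `and.i8.v2i8`). Semantically, `ReduceAnd::reduce_and` folds `&` over the lanes; assuming `ReduceAnd` and the portable types are imported as in the tests at the bottom of this file:

    let v = u32x4::new(0b0111, 0b0101, 0b1101, 0b0101);
    assert_eq!(ReduceAnd::reduce_and(v), 0b0101); // 0b0111 & 0b0101 & 0b1101 & 0b0101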
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the and reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.and.i8.v2i8"] + fn reduce_and_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v2u8"] + fn reduce_and_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v2i16"] + fn reduce_and_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v2u16"] + fn reduce_and_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i32.v2i32"] + fn reduce_and_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.and.u32.v2u32"] + fn reduce_and_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.and.i64.v2i64"] + fn reduce_and_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.and.u64.v2u64"] + fn reduce_and_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v4i8"] + fn reduce_and_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v4u8"] + fn reduce_and_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v4i16"] + fn reduce_and_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v4u16"] + fn reduce_and_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i32.v4i32"] + fn reduce_and_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.and.u32.v4u32"] + fn reduce_and_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.and.i64.v4i64"] + fn reduce_and_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.and.u64.v4u64"] + fn reduce_and_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v8i8"] + fn reduce_and_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v8u8"] + fn reduce_and_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v8i16"] + fn reduce_and_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v8u16"] + fn reduce_and_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i32.v8i32"] + fn reduce_and_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.and.u32.v8u32"] + fn reduce_and_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.and.i64.v8i64"] + fn reduce_and_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.and.u64.v8u64"] + fn reduce_and_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v16i8"] + fn reduce_and_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v16u8"] + fn reduce_and_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v16i16"] + fn reduce_and_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v16u16"] + fn reduce_and_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i32.v16i32"] + fn reduce_and_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.and.u32.v16u32"] + fn reduce_and_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v32i8"] + fn reduce_and_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v32u8"] + fn reduce_and_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v32i16"] + fn reduce_and_i16x32(x: i16x32) -> 
i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v32u16"] + fn reduce_and_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v64i8"] + fn reduce_and_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v64u8"] + fn reduce_and_u8x64(x: u8x64) -> u8; +} + +/// Reduction: horizontal bitwise and of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceAnd { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal bitwise and of the vector elements + fn reduce_and(self) -> Self::Acc; +} + +macro_rules! red_and { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceAnd for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_and(self) -> Self::Acc { + unsafe { $llvm_intr(self.into_bits()) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_and(self) -> Self::Acc { + let mut x = self.extract(0) as Self::Acc; + for i in 1..$id::lanes() { + x &= self.extract(i) as Self::Acc; + } + x + } + } + }; +} +red_and!(i8x2, i8, reduce_and_i8x2); +red_and!(u8x2, u8, reduce_and_u8x2); +red_and!(i16x2, i16, reduce_and_i16x2); +red_and!(u16x2, u16, reduce_and_u16x2); +red_and!(i32x2, i32, reduce_and_i32x2); +red_and!(u32x2, u32, reduce_and_u32x2); +red_and!(i64x2, i64, reduce_and_i64x2); +red_and!(u64x2, u64, reduce_and_u64x2); +red_and!(i8x4, i8, reduce_and_i8x4); +red_and!(u8x4, u8, reduce_and_u8x4); +red_and!(i16x4, i16, reduce_and_i16x4); +red_and!(u16x4, u16, reduce_and_u16x4); +red_and!(i32x4, i32, reduce_and_i32x4); +red_and!(u32x4, u32, reduce_and_u32x4); +red_and!(i64x4, i64, reduce_and_i64x4); +red_and!(u64x4, u64, reduce_and_u64x4); +red_and!(i8x8, i8, reduce_and_i8x8); +red_and!(u8x8, u8, reduce_and_u8x8); +red_and!(i16x8, i16, reduce_and_i16x8); +red_and!(u16x8, u16, reduce_and_u16x8); +red_and!(i32x8, i32, reduce_and_i32x8); +red_and!(u32x8, u32, reduce_and_u32x8); +red_and!(i64x8, i64, reduce_and_i64x8); +red_and!(u64x8, u64, reduce_and_u64x8); +red_and!(i8x16, i8, reduce_and_i8x16); +red_and!(u8x16, u8, reduce_and_u8x16); +red_and!(i16x16, i16, reduce_and_i16x16); +red_and!(u16x16, u16, reduce_and_u16x16); +red_and!(i32x16, i32, reduce_and_i32x16); +red_and!(u32x16, u32, reduce_and_u32x16); +red_and!(i8x32, i8, reduce_and_i8x32); +red_and!(u8x32, u8, reduce_and_u8x32); +red_and!(i16x32, i16, reduce_and_i16x32); +red_and!(u16x32, u16, reduce_and_u16x32); +red_and!(i8x64, i8, reduce_and_i8x64); +red_and!(u8x64, u8, reduce_and_u8x64); + +red_and!(b8x2, i8, reduce_and_i8x2); +red_and!(b8x4, i8, reduce_and_i8x4); +red_and!(b8x8, i8, reduce_and_i8x8); +red_and!(b8x16, i8, reduce_and_i8x16); +red_and!(b8x32, i8, reduce_and_i8x32); +red_and!(b8x64, i8, reduce_and_i8x64); + +#[cfg(test)] +mod tests { + use super::ReduceAnd; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_and_i32x4() { + let v = i32x4::splat(1); + assert_eq!(v.reduce_and(), 1_i32); + let v = i32x4::new(1, 1, 0, 1); + assert_eq!(v.reduce_and(), 0_i32); + } +} diff --git a/coresimd/ppsv/codegen/max.rs b/coresimd/ppsv/codegen/max.rs new file mode 100644 index 0000000000..7ec4884b5a --- /dev/null +++ b/coresimd/ppsv/codegen/max.rs @@ -0,0 +1,196 @@ +//! Code generation for the max reduction. 
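Note (illustrative sketch, not part of this patch): signed and unsigned lanes are bound to different intrinsics (`smax` vs. `umax`), and the floating-point vectors use the `fmax` variants; the reduction behaves like a lane-by-lane fold of `max`. Assuming `ReduceMax` and the portable types are imported as in the tests below:

    assert_eq!(ReduceMax::reduce_max(i8x4::new(-1, 2, 3, 4)), 4);
    assert_eq!(ReduceMax::reduce_max(u8x4::new(255, 2, 3, 4)), 255);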
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the max reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v2i8"] + fn reduce_max_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v2u8"] + fn reduce_max_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v2i16"] + fn reduce_max_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v2u16"] + fn reduce_max_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i32.v2i32"] + fn reduce_max_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umax.u32.v2u32"] + fn reduce_max_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smax.i64.v2i64"] + fn reduce_max_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umax.u64.v2u64"] + fn reduce_max_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v4i8"] + fn reduce_max_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v4u8"] + fn reduce_max_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v4i16"] + fn reduce_max_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v4u16"] + fn reduce_max_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i32.v4i32"] + fn reduce_max_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umax.u32.v4u32"] + fn reduce_max_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smax.i64.v4i64"] + fn reduce_max_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umax.u64.v4u64"] + fn reduce_max_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v8i8"] + fn reduce_max_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v8u8"] + fn reduce_max_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v8i16"] + fn reduce_max_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v8u16"] + fn reduce_max_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i32.v8i32"] + fn reduce_max_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umax.u32.v8u32"] + fn reduce_max_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smax.i64.v8i64"] + fn reduce_max_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umax.u64.v8u64"] + fn reduce_max_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v16i8"] + fn reduce_max_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v16u8"] + fn reduce_max_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v16i16"] + fn reduce_max_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v16u16"] + fn reduce_max_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i32.v16i32"] + fn reduce_max_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umax.u32.v16u32"] + fn reduce_max_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v32i8"] + fn reduce_max_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v32u8"] + fn reduce_max_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v32i16"] + fn 
reduce_max_i16x32(x: i16x32) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v32u16"] + fn reduce_max_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v64i8"] + fn reduce_max_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v64u8"] + fn reduce_max_u8x64(x: u8x64) -> u8; + #[link_name = "llvm.experimental.vector.reduce.fmax.f32.v2f32"] + fn reduce_fmax_f32x2(x: f32x2) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmax.f64.v2f64"] + fn reduce_fmax_f64x2(x: f64x2) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmax.f32.v4f32"] + fn reduce_fmax_f32x4(x: f32x4) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmax.f64.v4f64"] + fn reduce_fmax_f64x4(x: f64x4) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmax.f32.v8f32"] + fn reduce_fmax_f32x8(x: f32x8) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmax.f64.v8f64"] + fn reduce_fmax_f64x8(x: f64x8) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmax.f32.v16f32"] + fn reduce_fmax_f32x16(x: f32x16) -> f32; +} + +/// Reduction: horizontal max of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceMax { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal max of the vector elements. + fn reduce_max(self) -> Self::Acc; +} + +macro_rules! red_max { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceMax for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_max(self) -> Self::Acc { + unsafe { $llvm_intr(self) } + } + // FIXME: broken on AArch64 + #[cfg(target_arch = "aarch64")] + #[allow(unused_imports)] + #[inline] + fn reduce_max(self) -> Self::Acc { + use num::Float; + use cmp::Ord; + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x = x.max(self.extract(i)); + } + x + } + } + }; +} +red_max!(i8x2, i8, reduce_max_i8x2); +red_max!(u8x2, u8, reduce_max_u8x2); +red_max!(i16x2, i16, reduce_max_i16x2); +red_max!(u16x2, u16, reduce_max_u16x2); +red_max!(i32x2, i32, reduce_max_i32x2); +red_max!(u32x2, u32, reduce_max_u32x2); +red_max!(i64x2, i64, reduce_max_i64x2); +red_max!(u64x2, u64, reduce_max_u64x2); +red_max!(i8x4, i8, reduce_max_i8x4); +red_max!(u8x4, u8, reduce_max_u8x4); +red_max!(i16x4, i16, reduce_max_i16x4); +red_max!(u16x4, u16, reduce_max_u16x4); +red_max!(i32x4, i32, reduce_max_i32x4); +red_max!(u32x4, u32, reduce_max_u32x4); +red_max!(i64x4, i64, reduce_max_i64x4); +red_max!(u64x4, u64, reduce_max_u64x4); +red_max!(i8x8, i8, reduce_max_i8x8); +red_max!(u8x8, u8, reduce_max_u8x8); +red_max!(i16x8, i16, reduce_max_i16x8); +red_max!(u16x8, u16, reduce_max_u16x8); +red_max!(i32x8, i32, reduce_max_i32x8); +red_max!(u32x8, u32, reduce_max_u32x8); +red_max!(i64x8, i64, reduce_max_i64x8); +red_max!(u64x8, u64, reduce_max_u64x8); +red_max!(i8x16, i8, reduce_max_i8x16); +red_max!(u8x16, u8, reduce_max_u8x16); +red_max!(i16x16, i16, reduce_max_i16x16); +red_max!(u16x16, u16, reduce_max_u16x16); +red_max!(i32x16, i32, reduce_max_i32x16); +red_max!(u32x16, u32, reduce_max_u32x16); +red_max!(i8x32, i8, reduce_max_i8x32); +red_max!(u8x32, u8, reduce_max_u8x32); +red_max!(i16x32, i16, reduce_max_i16x32); +red_max!(u16x32, u16, reduce_max_u16x32); +red_max!(i8x64, i8, reduce_max_i8x64); +red_max!(u8x64, u8, reduce_max_u8x64); + +red_max!(f32x2, f32, reduce_fmax_f32x2); +red_max!(f64x2, f64, reduce_fmax_f64x2); +red_max!(f32x4, f32, reduce_fmax_f32x4); +red_max!(f64x4, f64, 
reduce_fmax_f64x4); +red_max!(f32x8, f32, reduce_fmax_f32x8); +red_max!(f64x8, f64, reduce_fmax_f64x8); +red_max!(f32x16, f32, reduce_fmax_f32x16); + +#[cfg(test)] +mod tests { + use super::ReduceMax; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_max_i32x4() { + let v = i32x4::new(1, 2, -1, 3); + assert_eq!(v.reduce_max(), 3_i32); + } + #[test] + fn reduce_max_u32x4() { + let v = u32x4::new(4, 2, 7, 3); + assert_eq!(v.reduce_max(), 7_u32); + } + #[test] + fn reduce_max_f32x4() { + let v = f32x4::new(4., 2., -1., 3.); + assert_eq!(v.reduce_max(), 4.); + } +} diff --git a/coresimd/ppsv/codegen/min.rs b/coresimd/ppsv/codegen/min.rs new file mode 100644 index 0000000000..66e53a7057 --- /dev/null +++ b/coresimd/ppsv/codegen/min.rs @@ -0,0 +1,196 @@ +//! Code generation for the min reduction. +use ::coresimd::simd::*; + +/// LLVM intrinsics used in the min reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v2i8"] + fn reduce_min_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v2u8"] + fn reduce_min_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v2i16"] + fn reduce_min_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v2u16"] + fn reduce_min_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i32.v2i32"] + fn reduce_min_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umin.u32.v2u32"] + fn reduce_min_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smin.i64.v2i64"] + fn reduce_min_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umin.u64.v2u64"] + fn reduce_min_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v4i8"] + fn reduce_min_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v4u8"] + fn reduce_min_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v4i16"] + fn reduce_min_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v4u16"] + fn reduce_min_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i32.v4i32"] + fn reduce_min_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umin.u32.v4u32"] + fn reduce_min_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smin.i64.v4i64"] + fn reduce_min_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umin.u64.v4u64"] + fn reduce_min_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v8i8"] + fn reduce_min_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v8u8"] + fn reduce_min_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v8i16"] + fn reduce_min_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v8u16"] + fn reduce_min_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i32.v8i32"] + fn reduce_min_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umin.u32.v8u32"] + fn reduce_min_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smin.i64.v8i64"] + fn reduce_min_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umin.u64.v8u64"] + fn reduce_min_u64x8(x: u64x8) -> u64; + #[link_name = 
"llvm.experimental.vector.reduce.smin.i8.v16i8"] + fn reduce_min_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v16u8"] + fn reduce_min_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v16i16"] + fn reduce_min_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v16u16"] + fn reduce_min_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i32.v16i32"] + fn reduce_min_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umin.u32.v16u32"] + fn reduce_min_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v32i8"] + fn reduce_min_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v32u8"] + fn reduce_min_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v32i16"] + fn reduce_min_i16x32(x: i16x32) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v32u16"] + fn reduce_min_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v64i8"] + fn reduce_min_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v64u8"] + fn reduce_min_u8x64(x: u8x64) -> u8; + #[link_name = "llvm.experimental.vector.reduce.fmin.f32.v2f32"] + fn reduce_fmin_f32x2(x: f32x2) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmin.f64.v2f64"] + fn reduce_fmin_f64x2(x: f64x2) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmin.f32.v4f32"] + fn reduce_fmin_f32x4(x: f32x4) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmin.f64.v4f64"] + fn reduce_fmin_f64x4(x: f64x4) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmin.f32.v8f32"] + fn reduce_fmin_f32x8(x: f32x8) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmin.f64.v8f64"] + fn reduce_fmin_f64x8(x: f64x8) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmin.f32.v16f32"] + fn reduce_fmin_f32x16(x: f32x16) -> f32; +} + +/// Reduction: horizontal max of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceMin { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal max of the vector elements. + fn reduce_min(self) -> Self::Acc; +} + +macro_rules! 
red_min { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceMin for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_min(self) -> Self::Acc { + unsafe { $llvm_intr(self) } + } + // FIXME: broken on AArch64 + #[cfg(target_arch = "aarch64")] + #[allow(unused_imports)] + #[inline] + fn reduce_min(self) -> Self::Acc { + use num::Float; + use cmp::Ord; + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x = x.min(self.extract(i)); + } + x + } + } + }; +} +red_min!(i8x2, i8, reduce_min_i8x2); +red_min!(u8x2, u8, reduce_min_u8x2); +red_min!(i16x2, i16, reduce_min_i16x2); +red_min!(u16x2, u16, reduce_min_u16x2); +red_min!(i32x2, i32, reduce_min_i32x2); +red_min!(u32x2, u32, reduce_min_u32x2); +red_min!(i64x2, i64, reduce_min_i64x2); +red_min!(u64x2, u64, reduce_min_u64x2); +red_min!(i8x4, i8, reduce_min_i8x4); +red_min!(u8x4, u8, reduce_min_u8x4); +red_min!(i16x4, i16, reduce_min_i16x4); +red_min!(u16x4, u16, reduce_min_u16x4); +red_min!(i32x4, i32, reduce_min_i32x4); +red_min!(u32x4, u32, reduce_min_u32x4); +red_min!(i64x4, i64, reduce_min_i64x4); +red_min!(u64x4, u64, reduce_min_u64x4); +red_min!(i8x8, i8, reduce_min_i8x8); +red_min!(u8x8, u8, reduce_min_u8x8); +red_min!(i16x8, i16, reduce_min_i16x8); +red_min!(u16x8, u16, reduce_min_u16x8); +red_min!(i32x8, i32, reduce_min_i32x8); +red_min!(u32x8, u32, reduce_min_u32x8); +red_min!(i64x8, i64, reduce_min_i64x8); +red_min!(u64x8, u64, reduce_min_u64x8); +red_min!(i8x16, i8, reduce_min_i8x16); +red_min!(u8x16, u8, reduce_min_u8x16); +red_min!(i16x16, i16, reduce_min_i16x16); +red_min!(u16x16, u16, reduce_min_u16x16); +red_min!(i32x16, i32, reduce_min_i32x16); +red_min!(u32x16, u32, reduce_min_u32x16); +red_min!(i8x32, i8, reduce_min_i8x32); +red_min!(u8x32, u8, reduce_min_u8x32); +red_min!(i16x32, i16, reduce_min_i16x32); +red_min!(u16x32, u16, reduce_min_u16x32); +red_min!(i8x64, i8, reduce_min_i8x64); +red_min!(u8x64, u8, reduce_min_u8x64); + +red_min!(f32x2, f32, reduce_fmin_f32x2); +red_min!(f64x2, f64, reduce_fmin_f64x2); +red_min!(f32x4, f32, reduce_fmin_f32x4); +red_min!(f64x4, f64, reduce_fmin_f64x4); +red_min!(f32x8, f32, reduce_fmin_f32x8); +red_min!(f64x8, f64, reduce_fmin_f64x8); +red_min!(f32x16, f32, reduce_fmin_f32x16); + +#[cfg(test)] +mod tests { + use super::ReduceMin; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_min_i32x4() { + let v = i32x4::new(1, 2, -1, 3); + assert_eq!(v.reduce_min(), -1_i32); + } + #[test] + fn reduce_min_u32x4() { + let v = u32x4::new(4, 2, 7, 3); + assert_eq!(v.reduce_min(), 2_u32); + } + #[test] + fn reduce_min_f32x4() { + let v = f32x4::new(4., 2., -1., 3.); + assert_eq!(v.reduce_min(), -1.); + } +} diff --git a/coresimd/ppsv/codegen/mod.rs b/coresimd/ppsv/codegen/mod.rs new file mode 100644 index 0000000000..26beb14563 --- /dev/null +++ b/coresimd/ppsv/codegen/mod.rs @@ -0,0 +1,9 @@ +//! Code Generation + +pub mod sum; +pub mod product; +pub mod and; +pub mod or; +pub mod xor; +pub mod min; +pub mod max; diff --git a/coresimd/ppsv/codegen/or.rs b/coresimd/ppsv/codegen/or.rs new file mode 100644 index 0000000000..007d8f9e11 --- /dev/null +++ b/coresimd/ppsv/codegen/or.rs @@ -0,0 +1,170 @@ +//! Code generation for the or reduction. 
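Note (illustrative sketch, not part of this patch): `reduce_or` folds `|` over the lanes; the `b8xN` boolean impls at the bottom of this file reuse the `i8` intrinsics, presumably so that the portable boolean reductions lower to the same code. Assuming `ReduceOr` and the portable types are imported as in the tests below:

    let v = i32x4::new(0, 0, 2, 0);
    assert_eq!(ReduceOr::reduce_or(v), 2); // 0 | 0 | 2 | 0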
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the or reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.or.i8.v2i8"] + fn reduce_or_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v2u8"] + fn reduce_or_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v2i16"] + fn reduce_or_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.or.u16.v2u16"] + fn reduce_or_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i32.v2i32"] + fn reduce_or_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.or.u32.v2u32"] + fn reduce_or_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.or.i64.v2i64"] + fn reduce_or_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.or.u64.v2u64"] + fn reduce_or_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v4i8"] + fn reduce_or_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v4u8"] + fn reduce_or_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v4i16"] + fn reduce_or_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.or.u16.v4u16"] + fn reduce_or_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i32.v4i32"] + fn reduce_or_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.or.u32.v4u32"] + fn reduce_or_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.or.i64.v4i64"] + fn reduce_or_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.or.u64.v4u64"] + fn reduce_or_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v8i8"] + fn reduce_or_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v8u8"] + fn reduce_or_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v8i16"] + fn reduce_or_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.or.u16.v8u16"] + fn reduce_or_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i32.v8i32"] + fn reduce_or_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.or.u32.v8u32"] + fn reduce_or_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.or.i64.v8i64"] + fn reduce_or_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.or.u64.v8u64"] + fn reduce_or_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v16i8"] + fn reduce_or_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v16u8"] + fn reduce_or_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v16i16"] + fn reduce_or_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.or.u16.v16u16"] + fn reduce_or_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i32.v16i32"] + fn reduce_or_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.or.u32.v16u32"] + fn reduce_or_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v32i8"] + fn reduce_or_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v32u8"] + fn reduce_or_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v32i16"] + fn reduce_or_i16x32(x: i16x32) -> i16; + #[link_name = 
"llvm.experimental.vector.reduce.or.u16.v32u16"] + fn reduce_or_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v64i8"] + fn reduce_or_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v64u8"] + fn reduce_or_u8x64(x: u8x64) -> u8; +} + +/// Reduction: horizontal bitwise or of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceOr { + /// Result of the reduction. + type Acc; + /// Computes the horizontal bitwise or of the vector elements. + fn reduce_or(self) -> Self::Acc; +} + +macro_rules! red_or { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceOr for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_or(self) -> Self::Acc { + unsafe { $llvm_intr(self.into_bits()) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_or(self) -> Self::Acc { + let mut x = self.extract(0) as Self::Acc; + for i in 1..$id::lanes() { + x |= self.extract(i) as Self::Acc; + } + x + } + } + }; +} +red_or!(i8x2, i8, reduce_or_i8x2); +red_or!(u8x2, u8, reduce_or_u8x2); +red_or!(i16x2, i16, reduce_or_i16x2); +red_or!(u16x2, u16, reduce_or_u16x2); +red_or!(i32x2, i32, reduce_or_i32x2); +red_or!(u32x2, u32, reduce_or_u32x2); +red_or!(i64x2, i64, reduce_or_i64x2); +red_or!(u64x2, u64, reduce_or_u64x2); +red_or!(i8x4, i8, reduce_or_i8x4); +red_or!(u8x4, u8, reduce_or_u8x4); +red_or!(i16x4, i16, reduce_or_i16x4); +red_or!(u16x4, u16, reduce_or_u16x4); +red_or!(i32x4, i32, reduce_or_i32x4); +red_or!(u32x4, u32, reduce_or_u32x4); +red_or!(i64x4, i64, reduce_or_i64x4); +red_or!(u64x4, u64, reduce_or_u64x4); +red_or!(i8x8, i8, reduce_or_i8x8); +red_or!(u8x8, u8, reduce_or_u8x8); +red_or!(i16x8, i16, reduce_or_i16x8); +red_or!(u16x8, u16, reduce_or_u16x8); +red_or!(i32x8, i32, reduce_or_i32x8); +red_or!(u32x8, u32, reduce_or_u32x8); +red_or!(i64x8, i64, reduce_or_i64x8); +red_or!(u64x8, u64, reduce_or_u64x8); +red_or!(i8x16, i8, reduce_or_i8x16); +red_or!(u8x16, u8, reduce_or_u8x16); +red_or!(i16x16, i16, reduce_or_i16x16); +red_or!(u16x16, u16, reduce_or_u16x16); +red_or!(i32x16, i32, reduce_or_i32x16); +red_or!(u32x16, u32, reduce_or_u32x16); +red_or!(i8x32, i8, reduce_or_i8x32); +red_or!(u8x32, u8, reduce_or_u8x32); +red_or!(i16x32, i16, reduce_or_i16x32); +red_or!(u16x32, u16, reduce_or_u16x32); +red_or!(i8x64, i8, reduce_or_i8x64); +red_or!(u8x64, u8, reduce_or_u8x64); + +red_or!(b8x2, i8, reduce_or_i8x2); +red_or!(b8x4, i8, reduce_or_i8x4); +red_or!(b8x8, i8, reduce_or_i8x8); +red_or!(b8x16, i8, reduce_or_i8x16); +red_or!(b8x32, i8, reduce_or_i8x32); +red_or!(b8x64, i8, reduce_or_i8x64); + +#[cfg(test)] +mod tests { + use super::ReduceOr; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_or_i32x4() { + let v = i32x4::splat(1); + assert_eq!(v.reduce_or(), 1_i32); + let v = i32x4::new(1, 1, 0, 1); + assert_eq!(v.reduce_or(), 1_i32); + } +} diff --git a/coresimd/ppsv/codegen/product.rs b/coresimd/ppsv/codegen/product.rs new file mode 100644 index 0000000000..725ee884c5 --- /dev/null +++ b/coresimd/ppsv/codegen/product.rs @@ -0,0 +1,210 @@ +//! Code generation for the product reduction. 
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the product reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v2i8"] + fn reduce_mul_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v2u8"] + fn reduce_mul_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v2i16"] + fn reduce_mul_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v2u16"] + fn reduce_mul_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i32.v2i32"] + fn reduce_mul_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.mul.u32.v2u32"] + fn reduce_mul_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.mul.i64.v2i64"] + fn reduce_mul_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.mul.u64.v2u64"] + fn reduce_mul_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v4i8"] + fn reduce_mul_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v4u8"] + fn reduce_mul_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v4i16"] + fn reduce_mul_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v4u16"] + fn reduce_mul_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i32.v4i32"] + fn reduce_mul_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.mul.u32.v4u32"] + fn reduce_mul_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.mul.i64.v4i64"] + fn reduce_mul_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.mul.u64.v4u64"] + fn reduce_mul_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v8i8"] + fn reduce_mul_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v8u8"] + fn reduce_mul_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v8i16"] + fn reduce_mul_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v8u16"] + fn reduce_mul_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i32.v8i32"] + fn reduce_mul_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.mul.u32.v8u32"] + fn reduce_mul_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.mul.i64.v8i64"] + fn reduce_mul_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.mul.u64.v8u64"] + fn reduce_mul_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v16i8"] + fn reduce_mul_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v16u8"] + fn reduce_mul_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v16i16"] + fn reduce_mul_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v16u16"] + fn reduce_mul_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i32.v16i32"] + fn reduce_mul_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.mul.u32.v16u32"] + fn reduce_mul_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v32i8"] + fn reduce_mul_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v32u8"] + fn reduce_mul_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v32i16"] + fn reduce_mul_i16x32(x: i16x32) 
-> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v32u16"] + fn reduce_mul_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v64i8"] + fn reduce_mul_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v64u8"] + fn reduce_mul_u8x64(x: u8x64) -> u8; + #[link_name = "llvm.experimental.vector.reduce.fmul.f32.v2f32"] + fn reduce_fmul_f32x2(acc: f32, x: f32x2) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmul.f64.v2f64"] + fn reduce_fmul_f64x2(acc: f64, x: f64x2) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmul.f32.v4f32"] + fn reduce_fmul_f32x4(acc: f32, x: f32x4) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmul.f64.v4f64"] + fn reduce_fmul_f64x4(acc: f64, x: f64x4) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmul.f32.v8f32"] + fn reduce_fmul_f32x8(acc: f32, x: f32x8) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmul.f64.v8f64"] + fn reduce_fmul_f64x8(acc: f64, x: f64x8) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmul.f32.v16f32"] + fn reduce_fmul_f32x16(acc: f32, x: f32x16) -> f32; +} + +/// Reduction: horizontal product of the vector elements. +pub trait ReduceMul { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal product of the vector elements. + fn reduce_mul(self) -> Self::Acc; +} + +macro_rules! red_mul { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceMul for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_mul(self) -> Self::Acc { + unsafe { $llvm_intr(self) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_mul(self) -> Self::Acc { + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x *= self.extract(i); + } + x + } + } + }; +} +red_mul!(i8x2, i8, reduce_mul_i8x2); +red_mul!(u8x2, u8, reduce_mul_u8x2); +red_mul!(i16x2, i16, reduce_mul_i16x2); +red_mul!(u16x2, u16, reduce_mul_u16x2); +red_mul!(i32x2, i32, reduce_mul_i32x2); +red_mul!(u32x2, u32, reduce_mul_u32x2); +red_mul!(i64x2, i64, reduce_mul_i64x2); +red_mul!(u64x2, u64, reduce_mul_u64x2); +red_mul!(i8x4, i8, reduce_mul_i8x4); +red_mul!(u8x4, u8, reduce_mul_u8x4); +red_mul!(i16x4, i16, reduce_mul_i16x4); +red_mul!(u16x4, u16, reduce_mul_u16x4); +red_mul!(i32x4, i32, reduce_mul_i32x4); +red_mul!(u32x4, u32, reduce_mul_u32x4); +red_mul!(i64x4, i64, reduce_mul_i64x4); +red_mul!(u64x4, u64, reduce_mul_u64x4); +red_mul!(i8x8, i8, reduce_mul_i8x8); +red_mul!(u8x8, u8, reduce_mul_u8x8); +red_mul!(i16x8, i16, reduce_mul_i16x8); +red_mul!(u16x8, u16, reduce_mul_u16x8); +red_mul!(i32x8, i32, reduce_mul_i32x8); +red_mul!(u32x8, u32, reduce_mul_u32x8); +red_mul!(i64x8, i64, reduce_mul_i64x8); +red_mul!(u64x8, u64, reduce_mul_u64x8); +red_mul!(i8x16, i8, reduce_mul_i8x16); +red_mul!(u8x16, u8, reduce_mul_u8x16); +red_mul!(i16x16, i16, reduce_mul_i16x16); +red_mul!(u16x16, u16, reduce_mul_u16x16); +red_mul!(i32x16, i32, reduce_mul_i32x16); +red_mul!(u32x16, u32, reduce_mul_u32x16); +red_mul!(i8x32, i8, reduce_mul_i8x32); +red_mul!(u8x32, u8, reduce_mul_u8x32); +red_mul!(i16x32, i16, reduce_mul_i16x32); +red_mul!(u16x32, u16, reduce_mul_u16x32); +red_mul!(i8x64, i8, reduce_mul_i8x64); +red_mul!(u8x64, u8, reduce_mul_u8x64); + +macro_rules! red_fmul { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceMul for $id { + type Acc = $elem_ty; + #[inline] + fn reduce_mul(self) -> Self::Acc { + // FIXME: + // unsafe { $llvm_intr(1. 
as $elem_ty, self) } + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x *= self.extract(i); + } + x + } + } + }; +} + +red_fmul!(f32x2, f32, reduce_fmul_f32x2); +red_fmul!(f64x2, f64, reduce_fmul_f64x2); +red_fmul!(f32x4, f32, reduce_fmul_f32x4); +red_fmul!(f64x4, f64, reduce_fmul_f64x4); +red_fmul!(f32x8, f32, reduce_fmul_f32x8); +red_fmul!(f64x8, f64, reduce_fmul_f64x8); +red_fmul!(f32x16, f32, reduce_fmul_f32x16); + +#[cfg(test)] +mod tests { + use super::ReduceMul; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_mul_i32x4() { + let v = i32x4::splat(2); + assert_eq!(v.reduce_mul(), 16_i32); + } + #[test] + fn reduce_mul_u32x4() { + let v = u32x4::splat(2); + assert_eq!(v.reduce_mul(), 16_u32); + } + #[test] + fn reduce_mul_f32x4() { + let v = f32x4::splat(2.); + assert_eq!(v.reduce_mul(), 16.); + } +} diff --git a/coresimd/ppsv/codegen/sum.rs b/coresimd/ppsv/codegen/sum.rs new file mode 100644 index 0000000000..79b1ba0849 --- /dev/null +++ b/coresimd/ppsv/codegen/sum.rs @@ -0,0 +1,210 @@ +//! Code generation for the sum reduction. +use ::coresimd::simd::*; + +/// LLVM intrinsics used in the sum reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.add.i8.v2i8"] + fn reduce_add_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v2u8"] + fn reduce_add_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v2i16"] + fn reduce_add_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v2u16"] + fn reduce_add_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i32.v2i32"] + fn reduce_add_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.add.u32.v2u32"] + fn reduce_add_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.add.i64.v2i64"] + fn reduce_add_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.add.u64.v2u64"] + fn reduce_add_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v4i8"] + fn reduce_add_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v4u8"] + fn reduce_add_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v4i16"] + fn reduce_add_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v4u16"] + fn reduce_add_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i32.v4i32"] + fn reduce_add_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.add.u32.v4u32"] + fn reduce_add_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.add.i64.v4i64"] + fn reduce_add_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.add.u64.v4u64"] + fn reduce_add_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v8i8"] + fn reduce_add_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v8u8"] + fn reduce_add_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v8i16"] + fn reduce_add_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v8u16"] + fn reduce_add_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i32.v8i32"] + fn reduce_add_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.add.u32.v8u32"] + fn reduce_add_u32x8(x: u32x8) -> u32; + #[link_name = 
"llvm.experimental.vector.reduce.add.i64.v8i64"] + fn reduce_add_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.add.u64.v8u64"] + fn reduce_add_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v16i8"] + fn reduce_add_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v16u8"] + fn reduce_add_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v16i16"] + fn reduce_add_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v16u16"] + fn reduce_add_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i32.v16i32"] + fn reduce_add_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.add.u32.v16u32"] + fn reduce_add_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v32i8"] + fn reduce_add_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v32u8"] + fn reduce_add_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v32i16"] + fn reduce_add_i16x32(x: i16x32) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v32u16"] + fn reduce_add_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v64i8"] + fn reduce_add_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v64u8"] + fn reduce_add_u8x64(x: u8x64) -> u8; + #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v2f32"] + fn reduce_fadd_f32x2(acc: f32, x: f32x2) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v2f64"] + fn reduce_fadd_f64x2(acc: f64, x: f64x2) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v4f32"] + fn reduce_fadd_f32x4(acc: f32, x: f32x4) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v4f64"] + fn reduce_fadd_f64x4(acc: f64, x: f64x4) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v8f32"] + fn reduce_fadd_f32x8(acc: f32, x: f32x8) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v8f64"] + fn reduce_fadd_f64x8(acc: f64, x: f64x8) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v16f32"] + fn reduce_fadd_f32x16(acc: f32, x: f32x16) -> f32; +} + +/// Reduction: horizontal sum of the vector elements. +pub trait ReduceAdd { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal sum of the vector elements. + fn reduce_add(self) -> Self::Acc; +} + +macro_rules! 
red_add { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceAdd for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_add(self) -> Self::Acc { + unsafe { $llvm_intr(self) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_add(self) -> Self::Acc { + let mut x = self.extract(0) as Self::Acc; + for i in 1..$id::lanes() { + x += self.extract(i) as Self::Acc; + } + x + } + } + }; +} +red_add!(i8x2, i8, reduce_add_i8x2); +red_add!(u8x2, u8, reduce_add_u8x2); +red_add!(i16x2, i16, reduce_add_i16x2); +red_add!(u16x2, u16, reduce_add_u16x2); +red_add!(i32x2, i32, reduce_add_i32x2); +red_add!(u32x2, u32, reduce_add_u32x2); +red_add!(i64x2, i64, reduce_add_i64x2); +red_add!(u64x2, u64, reduce_add_u64x2); +red_add!(i8x4, i8, reduce_add_i8x4); +red_add!(u8x4, u8, reduce_add_u8x4); +red_add!(i16x4, i16, reduce_add_i16x4); +red_add!(u16x4, u16, reduce_add_u16x4); +red_add!(i32x4, i32, reduce_add_i32x4); +red_add!(u32x4, u32, reduce_add_u32x4); +red_add!(i64x4, i64, reduce_add_i64x4); +red_add!(u64x4, u64, reduce_add_u64x4); +red_add!(i8x8, i8, reduce_add_i8x8); +red_add!(u8x8, u8, reduce_add_u8x8); +red_add!(i16x8, i16, reduce_add_i16x8); +red_add!(u16x8, u16, reduce_add_u16x8); +red_add!(i32x8, i32, reduce_add_i32x8); +red_add!(u32x8, u32, reduce_add_u32x8); +red_add!(i64x8, i64, reduce_add_i64x8); +red_add!(u64x8, u64, reduce_add_u64x8); +red_add!(i8x16, i8, reduce_add_i8x16); +red_add!(u8x16, u8, reduce_add_u8x16); +red_add!(i16x16, i16, reduce_add_i16x16); +red_add!(u16x16, u16, reduce_add_u16x16); +red_add!(i32x16, i32, reduce_add_i32x16); +red_add!(u32x16, u32, reduce_add_u32x16); +red_add!(i8x32, i8, reduce_add_i8x32); +red_add!(u8x32, u8, reduce_add_u8x32); +red_add!(i16x32, i16, reduce_add_i16x32); +red_add!(u16x32, u16, reduce_add_u16x32); +red_add!(i8x64, i8, reduce_add_i8x64); +red_add!(u8x64, u8, reduce_add_u8x64); + +macro_rules! red_fadd { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceAdd for $id { + type Acc = $elem_ty; + #[inline] + fn reduce_add(self) -> Self::Acc { + // FIXME: + //unsafe { $llvm_intr(0. as $elem_ty, self) } + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x += self.extract(i); + } + x + } + } + }; +} + +red_fadd!(f32x2, f32, reduce_fadd_f32x2); +red_fadd!(f64x2, f64, reduce_fadd_f64x2); +red_fadd!(f32x4, f32, reduce_fadd_f32x4); +red_fadd!(f64x4, f64, reduce_fadd_f64x4); +red_fadd!(f32x8, f32, reduce_fadd_f32x8); +red_fadd!(f64x8, f64, reduce_fadd_f64x8); +red_fadd!(f32x16, f32, reduce_fadd_f32x16); + +#[cfg(test)] +mod tests { + use super::ReduceAdd; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_add_i32x4() { + let v = i32x4::splat(1); + assert_eq!(v.reduce_add(), 4_i32); + } + #[test] + fn reduce_add_u32x4() { + let v = u32x4::splat(1); + assert_eq!(v.reduce_add(), 4_u32); + } + #[test] + fn reduce_add_f32x4() { + let v = f32x4::splat(1.); + assert_eq!(v.reduce_add(), 4.); + } +} diff --git a/coresimd/ppsv/codegen/xor.rs b/coresimd/ppsv/codegen/xor.rs new file mode 100644 index 0000000000..cf5ef2bece --- /dev/null +++ b/coresimd/ppsv/codegen/xor.rs @@ -0,0 +1,170 @@ +//! Code generation for the xor reduction. 
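Note (illustrative sketch, not part of this patch): `reduce_xor` folds `^` over the lanes, so lane values that occur an even number of times cancel out. Assuming `ReduceXor` and the portable types are imported as in the tests below:

    let v = u32x4::new(0b0011, 0b0101, 0b0110, 0b0000);
    assert_eq!(ReduceXor::reduce_xor(v), 0); // 0b0011 ^ 0b0101 ^ 0b0110 ^ 0b0000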
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the xor reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v2i8"] + fn reduce_xor_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v2u8"] + fn reduce_xor_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v2i16"] + fn reduce_xor_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v2u16"] + fn reduce_xor_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i32.v2i32"] + fn reduce_xor_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.xor.u32.v2u32"] + fn reduce_xor_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.xor.i64.v2i64"] + fn reduce_xor_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.xor.u64.v2u64"] + fn reduce_xor_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v4i8"] + fn reduce_xor_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v4u8"] + fn reduce_xor_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v4i16"] + fn reduce_xor_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v4u16"] + fn reduce_xor_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i32.v4i32"] + fn reduce_xor_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.xor.u32.v4u32"] + fn reduce_xor_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.xor.i64.v4i64"] + fn reduce_xor_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.xor.u64.v4u64"] + fn reduce_xor_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v8i8"] + fn reduce_xor_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v8u8"] + fn reduce_xor_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v8i16"] + fn reduce_xor_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v8u16"] + fn reduce_xor_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i32.v8i32"] + fn reduce_xor_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.xor.u32.v8u32"] + fn reduce_xor_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.xor.i64.v8i64"] + fn reduce_xor_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.xor.u64.v8u64"] + fn reduce_xor_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v16i8"] + fn reduce_xor_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v16u8"] + fn reduce_xor_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v16i16"] + fn reduce_xor_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v16u16"] + fn reduce_xor_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i32.v16i32"] + fn reduce_xor_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.xor.u32.v16u32"] + fn reduce_xor_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v32i8"] + fn reduce_xor_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v32u8"] + fn reduce_xor_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v32i16"] + fn reduce_xor_i16x32(x: i16x32) -> 
i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v32u16"] + fn reduce_xor_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v64i8"] + fn reduce_xor_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v64u8"] + fn reduce_xor_u8x64(x: u8x64) -> u8; +} + +/// Reduction: horizontal bitwise xor of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceXor { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal bitwise xor of the vector elements. + fn reduce_xor(self) -> Self::Acc; +} + +macro_rules! red_xor { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceXor for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_xor(self) -> Self::Acc { + unsafe { $llvm_intr(self.into_bits()) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_xor(self) -> Self::Acc { + let mut x = self.extract(0) as Self::Acc; + for i in 1..$id::lanes() { + x ^= self.extract(i) as Self::Acc; + } + x + } + } + }; +} +red_xor!(i8x2, i8, reduce_xor_i8x2); +red_xor!(u8x2, u8, reduce_xor_u8x2); +red_xor!(i16x2, i16, reduce_xor_i16x2); +red_xor!(u16x2, u16, reduce_xor_u16x2); +red_xor!(i32x2, i32, reduce_xor_i32x2); +red_xor!(u32x2, u32, reduce_xor_u32x2); +red_xor!(i64x2, i64, reduce_xor_i64x2); +red_xor!(u64x2, u64, reduce_xor_u64x2); +red_xor!(i8x4, i8, reduce_xor_i8x4); +red_xor!(u8x4, u8, reduce_xor_u8x4); +red_xor!(i16x4, i16, reduce_xor_i16x4); +red_xor!(u16x4, u16, reduce_xor_u16x4); +red_xor!(i32x4, i32, reduce_xor_i32x4); +red_xor!(u32x4, u32, reduce_xor_u32x4); +red_xor!(i64x4, i64, reduce_xor_i64x4); +red_xor!(u64x4, u64, reduce_xor_u64x4); +red_xor!(i8x8, i8, reduce_xor_i8x8); +red_xor!(u8x8, u8, reduce_xor_u8x8); +red_xor!(i16x8, i16, reduce_xor_i16x8); +red_xor!(u16x8, u16, reduce_xor_u16x8); +red_xor!(i32x8, i32, reduce_xor_i32x8); +red_xor!(u32x8, u32, reduce_xor_u32x8); +red_xor!(i64x8, i64, reduce_xor_i64x8); +red_xor!(u64x8, u64, reduce_xor_u64x8); +red_xor!(i8x16, i8, reduce_xor_i8x16); +red_xor!(u8x16, u8, reduce_xor_u8x16); +red_xor!(i16x16, i16, reduce_xor_i16x16); +red_xor!(u16x16, u16, reduce_xor_u16x16); +red_xor!(i32x16, i32, reduce_xor_i32x16); +red_xor!(u32x16, u32, reduce_xor_u32x16); +red_xor!(i8x32, i8, reduce_xor_i8x32); +red_xor!(u8x32, u8, reduce_xor_u8x32); +red_xor!(i16x32, i16, reduce_xor_i16x32); +red_xor!(u16x32, u16, reduce_xor_u16x32); +red_xor!(i8x64, i8, reduce_xor_i8x64); +red_xor!(u8x64, u8, reduce_xor_u8x64); + +red_xor!(b8x2, i8, reduce_xor_i8x2); +red_xor!(b8x4, i8, reduce_xor_i8x4); +red_xor!(b8x8, i8, reduce_xor_i8x8); +red_xor!(b8x16, i8, reduce_xor_i8x16); +red_xor!(b8x32, i8, reduce_xor_i8x32); +red_xor!(b8x64, i8, reduce_xor_i8x64); + +#[cfg(test)] +mod tests { + use super::ReduceXor; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_xor_i32x4() { + let v = i32x4::splat(1); + assert_eq!(v.reduce_xor(), 0_i32); + let v = i32x4::new(1, 0, 0, 0); + assert_eq!(v.reduce_xor(), 1_i32); + } +} diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs new file mode 100644 index 0000000000..69de61d906 --- /dev/null +++ b/coresimd/ppsv/mod.rs @@ -0,0 +1,82 @@ +//! Portable Packed-SIMD Vectors. +//! +//! These types are: +//! +//! * portable: work correctly on all architectures, +//! * packed: have a size fixed at compile-time. +//! +//! These two terms are the opposites of: +//! +//! 
* architecture-specific: only available in a particular architecture, +//! * scalable: the vector's size is dynamic. +//! +//! This module is structured as follows: +//! +//! * `api`: defines the API of the portable packed vector types. +//! * `v{width}`: defines the portable vector types for a particular `width`. +//! +//! The portable packed vector types are named using the following schema: +//! `{t}{l_w}x{l_n}`: +//! +//! * `t`: type - single letter corresponding to the following Rust literal +//! types: * `i`: signed integer +//! * `u`: unsigned integer +//! * `f`: floating point +//! * `b`: boolean +//! * `l_w`: lane width in bits +//! * `l_n`: number of lanes +//! +//! For example, `f32x4` is a vector type containing four 32-bit wide +//! floating-point numbers. The total width of this type is 32 bit times 4 +//! lanes, that is, 128 bits, and is thus defined in the `v128` module. + +#[macro_use] +mod api; +mod codegen; + +mod v16; +mod v32; +mod v64; +mod v128; +mod v256; +mod v512; + +pub use self::v16::*; +pub use self::v32::*; +pub use self::v64::*; +pub use self::v128::*; +pub use self::v256::*; +pub use self::v512::*; + +use marker; + +/// Safe lossless bitwise conversion from `T` to `Self`. +pub trait FromBits<T>: marker::Sized { + /// Safe lossless bitwise transmute from `T` to `Self`. + fn from_bits(T) -> Self; +} + +/// Safe lossless bitwise conversion from `Self` to `T`. +pub trait IntoBits<T>: marker::Sized { + /// Safe lossless bitwise transmute from `self` to `T`. + fn into_bits(self) -> T; +} + +// FromBits implies IntoBits +impl<T, U> IntoBits<U> for T +where + U: FromBits<T>, +{ + #[inline] + fn into_bits(self) -> U { + U::from_bits(self) + } +} + +// FromBits (and thus IntoBits) is reflexive +impl<T> FromBits<T> for T { + #[inline] + fn from_bits(t: Self) -> Self { + t + } +} diff --git a/coresimd/ppsv/v128.rs b/coresimd/ppsv/v128.rs new file mode 100644 index 0000000000..8a1293a16b --- /dev/null +++ b/coresimd/ppsv/v128.rs @@ -0,0 +1,338 @@ +//! 128-bit wide portable packed vector types. + +simd_api_imports!(); + +use ::coresimd::simd::{b8x2, b8x4, b8x8}; + +simd_i_ty! { + i8x16: 16, i8, b8x16, i8x16_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 128-bit vector with 16 `i8` lanes. +} + +simd_u_ty! { + u8x16: 16, u8, b8x16, u8x16_tests | + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 | + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 128-bit vector with 16 `u8` lanes. +} + +simd_b_ty! { + b8x16: 16, i8, b8x16_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 128-bit vector with 16 `bool` lanes. +} + +simd_i_ty! { + i16x8: 8, i16, b8x8, i16x8_tests | + i16, i16, i16, i16, i16, i16, i16, i16 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 128-bit vector with 8 `i16` lanes. +} + +simd_u_ty! { + u16x8: 8, u16, b8x8, u16x8_tests | + u16, u16, u16, u16, u16, u16, u16, u16 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 128-bit vector with 8 `u16` lanes. +} + +simd_i_ty! { + i32x4: 4, i32, b8x4, i32x4_tests | + i32, i32, i32, i32 | + x0, x1, x2, x3 | + /// A 128-bit vector with 4 `i32` lanes. +} + +simd_u_ty! { + u32x4: 4, u32, b8x4, u32x4_tests | + u32, u32, u32, u32 | + x0, x1, x2, x3 | + /// A 128-bit vector with 4 `u32` lanes. +} + +simd_f_ty!
{ + f32x4: 4, f32, b8x4, f32x4_tests | + f32, f32, f32, f32 | + x0, x1, x2, x3 | + /// A 128-bit vector with 4 `f32` lanes. +} + +simd_i_ty! { + i64x2: 2, i64, b8x2, i64x2_tests | + i64, i64 | + x0, x1 | + /// A 128-bit vector with 2 `i64` lanes. +} + +simd_u_ty! { + u64x2: 2, u64, b8x2, u64x2_tests | + u64, u64 | + x0, x1 | + /// A 128-bit vector with 2 `u64` lanes. +} + +simd_f_ty! { + f64x2: 2, f64, b8x2, f64x2_tests | + f64, f64 | + x0, x1 | + /// A 128-bit vector with 2 `f64` lanes. +} + +impl_from_bits!( + u64x2: u64, + u64x2_from_bits | i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + i64x2: i64, + i64x2_from_bits | u64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + f64x2: f64, + f64x2_from_bits | i64x2, + u64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + u32x4: u32, + u32x4_from_bits | u64x2, + i64x2, + f64x2, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + i32x4: i32, + i32x4_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + f32x4: f32, + f32x4_from_bits | u64x2, + i64x2, + f64x2, + i32x4, + u32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + u16x8: u16, + u16x8_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + i16x8: i16, + i16x8_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + u8x16: u8, + u8x16_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + i8x16, + b8x16 +); +impl_from_bits!( + i8x16: i8, + i8x16_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + b8x16 +); + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m128; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m128i; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m128d; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f64x2: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u64x2: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i64x2: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f32x4: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u32x4: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i32x4: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u16x8: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i16x8: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u8x16: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i8x16: __m128, __m128i, __m128d); + +impl_from!( + f64x2: f64, + f64x2_from | f32x2, + u64x2, + i64x2, + u32x2, + i32x2, + u16x2, + i16x2, + u8x2, + i8x2 +); +impl_from!( + f32x4: f32, + f32x4_from | f64x4, + u64x4, + i64x4, + u32x4, + i32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + 
u64x2: u64, + u64x2_from | f32x2, + f64x2, + i64x2, + i32x2, + u32x2, + i16x2, + u16x2, + i8x2, + u8x2 +); +impl_from!( + i64x2: i64, + i64x2_from | f32x2, + f64x2, + u64x2, + i32x2, + u32x2, + i16x2, + u16x2, + i8x2, + u8x2 +); +impl_from!( + u32x4: u32, + u32x4_from | f64x4, + u64x4, + i64x4, + f32x4, + i32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + i32x4: i32, + i32x4_from | f64x4, + u64x4, + i64x4, + f32x4, + u32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + i16x8: i16, + i16x8_from | f64x8, + u64x8, + i64x8, + f32x8, + u32x8, + i32x8, + u16x8, + u8x8, + i8x8 +); +impl_from!( + u16x8: u16, + u16x8_from | f64x8, + u64x8, + i64x8, + f32x8, + u32x8, + i32x8, + i16x8, + u8x8, + i8x8 +); diff --git a/coresimd/ppsv/v16.rs b/coresimd/ppsv/v16.rs new file mode 100644 index 0000000000..5bde9079f9 --- /dev/null +++ b/coresimd/ppsv/v16.rs @@ -0,0 +1,50 @@ +//! 16-bit wide portable packed vector types. + +simd_api_imports!(); + +simd_i_ty! { + i8x2: 2, i8, b8x2, i8x2_tests | + i8, i8 | + x0, x1 | + /// A 16-bit wide vector with 2 `i8` lanes. +} + +simd_u_ty! { + u8x2: 2, u8, b8x2, u8x2_tests | + u8, u8 | + x0, x1 | + /// A 16-bit wide vector with 2 `u8` lanes. +} + +simd_b_ty! { + b8x2: 2, i8, b8x2_tests | + i8, i8 | + x0, x1 | + /// A 16-bit wide vector with 2 `bool` lanes. +} + +impl_from_bits!(i8x2: i8, i8x2_from_bits | u8x2, b8x2); +impl_from_bits!(u8x2: u8, u8x2_from_bits | i8x2, b8x2); + +impl_from!( + i8x2: i8, + i8x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + i32x2, + u16x2, + u8x2 +); +impl_from!( + u8x2: u8, + u8x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + i32x2, + u16x2, + i8x2 +); diff --git a/coresimd/ppsv/v256.rs b/coresimd/ppsv/v256.rs new file mode 100644 index 0000000000..da7cdb92bd --- /dev/null +++ b/coresimd/ppsv/v256.rs @@ -0,0 +1,350 @@ +//! 256-bit wide portable packed vector types. + +simd_api_imports!(); + +use ::coresimd::simd::{b8x16, b8x8, b8x4}; + +simd_i_ty! { + i8x32: 32, i8, b8x32, i8x32_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 256-bit vector with 32 `i8` lanes. +} + +simd_u_ty! { + u8x32: 32, u8, b8x32, u8x32_tests | + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 256-bit vector with 32 `u8` lanes. +} + +simd_b_ty! { + b8x32: 32, i8, b8x32_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 256-bit vector with 32 `bool` lanes. +} + +simd_i_ty! { + i16x16: 16, i16, b8x16, i16x16_tests | + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 256-bit vector with 16 `i16` lanes. +} + +simd_u_ty! 
{ + u16x16: 16, u16, b8x16, u16x16_tests | + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 256-bit vector with 16 `u16` lanes. +} + +simd_i_ty! { + i32x8: 8, i32, b8x8, i32x8_tests | + i32, i32, i32, i32, i32, i32, i32, i32 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 256-bit vector with 8 `i32` lanes. +} + +simd_u_ty! { + u32x8: 8, u32, b8x8, u32x8_tests | + u32, u32, u32, u32, u32, u32, u32, u32 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 256-bit vector with 8 `u32` lanes. +} + +simd_f_ty! { + f32x8: 8, f32, b8x8, f32x8_tests | + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 256-bit vector with 8 `f32` lanes. +} + +simd_i_ty! { + i64x4: 4, i64, b8x4, i64x4_tests | + i64, i64, i64, i64 | + x0, x1, x2, x3 | + /// A 256-bit vector with 4 `i64` lanes. +} + +simd_u_ty! { + u64x4: 4, u64, b8x4, u64x4_tests | + u64, u64, u64, u64 | + x0, x1, x2, x3 | + /// A 256-bit vector with 4 `u64` lanes. +} + +simd_f_ty! { + f64x4: 4, f64, b8x4, f64x4_tests | + f64, f64, f64, f64 | + x0, x1, x2, x3 | + /// A 256-bit vector with 4 `f64` lanes. +} + +impl_from_bits!( + i8x32: i8, + i8x32_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + b8x32 +); +impl_from_bits!( + u8x32: u8, + u8x32_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + i8x32, + b8x32 +); +impl_from_bits!( + i16x16: i16, + i16x16_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + u16x16: u16, + u16x16_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + i32x8: i32, + i32x8_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + u32x8: u32, + u32x8_from_bits | u64x4, + i64x4, + f64x4, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + f32x8: f32, + f32x8_from_bits | u64x4, + i64x4, + f64x4, + i32x8, + u32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + i64x4: i64, + i64x4_from_bits | u64x4, + f64x4, + i32x8, + u32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + u64x4: u64, + u64x4_from_bits | i64x4, + f64x4, + i32x8, + u32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + f64x4: f64, + f64x4_from_bits | i64x4, + u64x4, + i32x8, + u32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m256; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m256i; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m256d; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f64x4: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u64x4: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i64x4: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f32x8: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u32x8: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i32x8: 
__m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u16x16: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i16x16: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u8x32: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i8x32: __m256, __m256i, __m256d); + +impl_from!( + f64x4: f64, + f64x4_from | u64x4, + i64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + i64x4: i64, + i64x4_from | u64x4, + f64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + u64x4: u64, + u64x4_from | i64x4, + f64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + f32x8: f32, + f32x8_from | u64x8, + i64x8, + f64x8, + u32x8, + i32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + i32x8: i32, + i32x8_from | u64x8, + i64x8, + f64x8, + u32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + u32x8: u32, + u32x8_from | u64x8, + i64x8, + f64x8, + i32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + i16x16: i16, + i16x16_from | u32x16, + i32x16, + f32x16, + u16x16, + u8x16, + i8x16 +); +impl_from!( + u16x16: u16, + u16x16_from | u32x16, + i32x16, + f32x16, + i16x16, + u8x16, + i8x16 +); +impl_from!(i8x32: i8, i8x32_from | u16x32, i16x32, u8x32); +impl_from!(u8x32: u8, u8x32_from | u16x32, i16x32, i8x32); diff --git a/coresimd/ppsv/v32.rs b/coresimd/ppsv/v32.rs new file mode 100644 index 0000000000..be772bd6a8 --- /dev/null +++ b/coresimd/ppsv/v32.rs @@ -0,0 +1,96 @@ +//! 32-bit wide portable packed vector types. + +simd_api_imports!(); +use ::coresimd::simd::{b8x2}; + +simd_i_ty! { + i16x2: 2, i16, b8x2, i16x2_tests | + i16, i16 | + x0, x1 | + /// A 32-bit wide vector with 2 `i16` lanes. +} + +simd_u_ty! { + u16x2: 2, u16, b8x2, u16x2_tests | + u16, u16 | + x0, x1 | + /// A 32-bit wide vector with 2 `u16` lanes. +} + +simd_i_ty! { + i8x4: 4, i8, b8x4, i8x4_tests | + i8, i8, i8, i8 | + x0, x1, x2, x3 | + /// A 32-bit wide vector with 4 `i8` lanes. +} + +simd_u_ty! { + u8x4: 4, u8, b8x4, u8x4_tests | + u8, u8, u8, u8 | + x0, x1, x2, x3 | + /// A 32-bit wide vector with 4 `u8` lanes. +} + +simd_b_ty! { + b8x4: 4, i8, b8x4_tests | + i8, i8, i8, i8 | + x0, x1, x2, x3 | + /// A 32-bit wide vector with 4 `bool` lanes. +} + +impl_from_bits!(i16x2: i16, i16x2_from_bits | u16x2, i8x4, u8x4, b8x4); +impl_from_bits!(u16x2: u16, u16x2_from_bits | i16x2, i8x4, u8x4, b8x4); +impl_from_bits!(i8x4: i8, i8x2_from_bits | i16x2, u16x2, u8x4, b8x4); +impl_from_bits!(u8x4: u8, u8x2_from_bits | i16x2, u16x2, i8x4, b8x4); + +impl_from!( + i16x2: i16, + i16x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + i32x2, + u16x2, + u8x2, + i8x2 +); + +impl_from!( + u16x2: u16, + u16x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + i32x2, + i16x2, + u8x2, + i8x2 +); + +impl_from!( + i8x4: i8, + i8x4_from | f64x4, + u64x4, + i64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + u8x4 +); + +impl_from!( + u8x4: u8, + u8x4_from | f64x4, + u64x4, + i64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + i8x4 +); diff --git a/coresimd/ppsv/v512.rs b/coresimd/ppsv/v512.rs new file mode 100644 index 0000000000..c9e1c0e7d2 --- /dev/null +++ b/coresimd/ppsv/v512.rs @@ -0,0 +1,331 @@ +//! 512-bit wide portable packed vector types. + +simd_api_imports!(); + +use ::coresimd::simd::{b8x32, b8x16, b8x8}; + +simd_i_ty! 
{ + i8x64: 64, i8, b8x64, i8x64_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31, + x32, x33, x34, x35, x36, x37, x38, x39, + x40, x41, x42, x43, x44, x45, x46, x47, + x48, x49, x50, x51, x52, x53, x54, x55, + x56, x57, x58, x59, x60, x61, x62, x63 | + /// A 512-bit vector with 64 `i8` lanes. +} + +simd_u_ty! { + u8x64: 64, u8, b8x64, u8x64_tests | + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31, + x32, x33, x34, x35, x36, x37, x38, x39, + x40, x41, x42, x43, x44, x45, x46, x47, + x48, x49, x50, x51, x52, x53, x54, x55, + x56, x57, x58, x59, x60, x61, x62, x63 | + /// A 512-bit vector with 64 `u8` lanes. +} + +simd_b_ty! { + b8x64: 64, i8, b8x64_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31, + x32, x33, x34, x35, x36, x37, x38, x39, + x40, x41, x42, x43, x44, x45, x46, x47, + x48, x49, x50, x51, x52, x53, x54, x55, + x56, x57, x58, x59, x60, x61, x62, x63 | + /// A 512-bit vector with 64 `bool` lanes. +} + +simd_i_ty! { + i16x32: 32, i16, b8x32, i16x32_tests | + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 512-bit vector with 32 `i16` lanes. +} + +simd_u_ty! { + u16x32: 32, u16, b8x32, u16x32_tests | + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 512-bit vector with 32 `u16` lanes. +} +simd_i_ty! { + i32x16: 16, i32, b8x16, i32x16_tests | + i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 512-bit vector with 16 `i32` lanes. +} + +simd_u_ty! { + u32x16: 16, u32, b8x16, u32x16_tests | + u32, u32, u32, u32, u32, u32, u32, u32, + u32, u32, u32, u32, u32, u32, u32, u32 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 512-bit vector with 16 `u32` lanes. +} + +simd_f_ty! 
{ + f32x16: 16, f32, b8x16, f32x16_tests | + f32, f32, f32, f32, f32, f32, f32, f32, + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 512-bit vector with 16 `f32` lanes. +} + +simd_i_ty! { + i64x8: 8, i64, b8x8, i64x8_tests | + i64, i64, i64, i64, i64, i64, i64, i64 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 512-bit vector with 8 `i64` lanes. +} + +simd_u_ty! { + u64x8: 8, u64, b8x8, u64x8_tests | + u64, u64, u64, u64, u64, u64, u64, u64 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 512-bit vector with 8 `u64` lanes. +} + +simd_f_ty! { + f64x8: 8, f64, b8x8, f64x8_tests | + f64, f64, f64, f64, f64, f64, f64, f64 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 512-bit vector with 8 `f64` lanes. +} + +impl_from_bits!( + i8x64: i8, + i8x64_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + u8x64, + b8x64 +); +impl_from_bits!( + u8x64: u8, + u8x64_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + b8x64 +); +impl_from_bits!( + i16x32: i16, + i16x32_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + u16x32: u16, + u16x32_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + i32x16: i32, + i32x16_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + u32x16: u32, + u32x16_from_bits | u64x8, + i64x8, + f64x8, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + f32x16: f32, + f32x16_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + i64x8: i64, + i64x8_from_bits | u64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + u64x8: u64, + u64x8_from_bits | i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + f64x8: f64, + f64x8_from_bits | u64x8, + i64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); + +impl_from!( + f64x8: f64, + f64x8_from | u64x8, + i64x8, + u32x8, + i32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + i64x8: i64, + i64x8_from | u64x8, + f64x8, + u32x8, + i32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + u64x8: u64, + u64x8_from | i64x8, + f64x8, + u32x8, + i32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); + +impl_from!( + f32x16: f32, + f32x16_from | u32x16, + i32x16, + u16x16, + i16x16, + u8x16, + i8x16 +); +impl_from!( + i32x16: i32, + i32x16_from | u32x16, + f32x16, + u16x16, + i16x16, + u8x16, + i8x16 +); +impl_from!( + u32x16: u32, + u32x16_from | i32x16, + f32x16, + u16x16, + i16x16, + u8x16, + i8x16 +); + +impl_from!(i16x32: i16, i16x32_from | u16x32, u8x32, i8x32); +impl_from!(u16x32: u16, u16x32_from | i16x32, u8x32, i8x32); + +impl_from!(i8x64: i8, i8x64_from | u8x64); +impl_from!(u8x64: u8, u8x64_from | i8x64); diff --git a/coresimd/ppsv/v64.rs b/coresimd/ppsv/v64.rs new file mode 100644 index 0000000000..874a12f45a --- /dev/null +++ b/coresimd/ppsv/v64.rs @@ -0,0 +1,235 @@ +//! 64-bit wide portable packed vector types. + +simd_api_imports!(); + +use ::coresimd::simd::{b8x4, b8x2}; + +simd_i_ty! 
{ + i8x8: 8, i8, b8x8, i8x8_tests | + i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 64-bit vector with 8 `i8` lanes. +} + +simd_u_ty! { + u8x8: 8, u8, b8x8, u8x8_tests | + u8, u8, u8, u8, u8, u8, u8, u8 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 64-bit vector with 8 `u8` lanes. +} + +simd_b_ty! { + b8x8: 8, i8, b8x8_tests | + i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 64-bit vector with 8 `bool` lanes. +} + +simd_i_ty! { + i16x4: 4, i16, b8x4, i16x4_tests | + i16, i16, i16, i16 | + x0, x1, x2, x3 | + /// A 64-bit vector with 4 `i16` lanes. +} + +simd_u_ty! { + u16x4: 4, u16, b8x4, u16x4_tests | + u16, u16, u16, u16 | + x0, x1, x2, x3 | + /// A 64-bit vector with 4 `u16` lanes. +} + +simd_i_ty! { + i32x2: 2, i32, b8x2, i32x2_tests | + i32, i32 | + x0, x1 | + /// A 64-bit vector with 2 `i32` lanes. +} + +simd_u_ty! { + u32x2: 2, u32, b8x2, u32x2_tests | + u32, u32 | + x0, x1 | + /// A 64-bit vector with 2 `u32` lanes. +} + +simd_f_ty! { + f32x2: 2, f32, b8x2, f32x2_tests | + f32, f32 | + x0, x1 | + /// A 64-bit vector with 2 `f32` lanes. +} + +impl_from_bits!( + u32x2: u32, + u32x2_from_bits | i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + i32x2: i32, + i32x2_from_bits | u32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + f32x2: f32, + f32x2_from_bits | i32x2, + u32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + u16x4: u16, + u16x4_from_bits | u32x2, + i32x2, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + i16x4: i16, + i16x4_from_bits | u32x2, + i32x2, + u16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + u8x8: u8, + u8x8_from_bits | u32x2, + i32x2, + u16x4, + i16x4, + i8x8, + b8x8 +); +impl_from_bits!( + i8x8: i8, + i8x8_from_bits | u32x2, + i32x2, + u16x4, + i16x4, + u8x8, + b8x8 +); + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m64; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f32x2: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u32x2: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i32x2: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u16x4: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i16x4: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u8x8: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i8x8: __m64); + +impl_from!( + f32x2: f32, + f32x2_from | f64x2, + u64x2, + i64x2, + u32x2, + i32x2, + u16x2, + i16x2, + u8x2, + i8x2 +); + +impl_from!( + u32x2: u32, + u32x2_from | f64x2, + u64x2, + i64x2, + f32x2, + i32x2, + u16x2, + i16x2, + u8x2, + i8x2 +); + +impl_from!( + i32x2: i32, + i32x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + u16x2, + i16x2, + u8x2, + i8x2 +); + +impl_from!( + u16x4: u16, + u16x4_from | f64x4, + u64x4, + i64x4, + f32x4, + i32x4, + u32x4, + i16x4, + u8x4, + i8x4 +); + +impl_from!( + i16x4: i16, + i16x4_from | f64x4, + u64x4, + i64x4, + f32x4, + i32x4, + u32x4, + u16x4, + u8x4, + i8x4 +); +impl_from!( + i8x8: i8, + i8x8_from | f64x8, + u64x8, + i64x8, + f32x8, + u32x8, + i32x8, + i16x8, + u16x8, + u8x8 +); +impl_from!( + u8x8: u8, + u8x8_from | f64x8, + u64x8, + i64x8, + f32x8, + u32x8, + i32x8, + i16x8, + u16x8, + i8x8 +); diff --git a/coresimd/v128.rs b/coresimd/v128.rs deleted file mode 100644 index 
ccdfc7f662..0000000000 --- a/coresimd/v128.rs +++ /dev/null @@ -1,111 +0,0 @@ -//! 128-bit wide vector types - -use prelude::v1::*; - -use coresimd::simd_llvm::*; - -define_ty! { f64x2, f64, f64 } -define_impl! { f64x2, f64, 2, i64x2, x0, x1 } - -define_ty! { f32x4, f32, f32, f32, f32 } -define_impl! { f32x4, f32, 4, i32x4, x0, x1, x2, x3 } - -define_ty! { u64x2, u64, u64 } -define_impl! { u64x2, u64, 2, i64x2, x0, x1 } - -define_ty! { i64x2, i64, i64 } -define_impl! { i64x2, i64, 2, i64x2, x0, x1 } - -define_ty! { u32x4, u32, u32, u32, u32 } -define_impl! { u32x4, u32, 4, i32x4, x0, x1, x2, x3 } - -define_ty! { i32x4, i32, i32, i32, i32 } -define_impl! { i32x4, i32, 4, i32x4, x0, x1, x2, x3 } - -define_ty! { u16x8, u16, u16, u16, u16, u16, u16, u16, u16 } -define_impl! { u16x8, u16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { i16x8, i16, i16, i16, i16, i16, i16, i16, i16 } -define_impl! { i16x8, i16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { - u8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 -} -define_impl! { - u8x16, u8, 16, i8x16, - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - i8x16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 -} -define_impl! { - i8x16, i8, 16, i8x16, - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_from!(u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); -define_from!(i64x2, u64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); -define_from!(u32x4, u64x2, i64x2, i32x4, u16x8, i16x8, u8x16, i8x16); -define_from!(i32x4, u64x2, i64x2, u32x4, u16x8, i16x8, u8x16, i8x16); -define_from!(u16x8, u64x2, i64x2, u32x4, i32x4, i16x8, u8x16, i8x16); -define_from!(i16x8, u64x2, i64x2, u32x4, i32x4, u16x8, u8x16, i8x16); -define_from!(u8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, i8x16); -define_from!(i8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16); - -define_common_ops!( - f64x2, - f32x4, - u64x2, - i64x2, - u32x4, - i32x4, - u16x8, - i16x8, - u8x16, - i8x16 -); -define_float_ops!(f64x2, f32x4); -define_integer_ops!( - (u64x2, u64), - (i64x2, i64), - (u32x4, u32), - (i32x4, i32), - (u16x8, u16), - (i16x8, i16), - (u8x16, u8), - (i8x16, i8) -); -define_signed_integer_ops!(i64x2, i32x4, i16x8, i8x16); -define_casts!( - (f64x2, f32x2, as_f32x2), - (f64x2, u64x2, as_u64x2), - (f64x2, i64x2, as_i64x2), - (f32x4, f64x4, as_f64x4), - (f32x4, u32x4, as_u32x4), - (f32x4, i32x4, as_i32x4), - (u64x2, f64x2, as_f64x2), - (u64x2, i64x2, as_i64x2), - (i64x2, f64x2, as_f64x2), - (i64x2, u64x2, as_u64x2), - (u32x4, f32x4, as_f32x4), - (u32x4, i32x4, as_i32x4), - (i32x4, f32x4, as_f32x4), - (i32x4, u32x4, as_u32x4), - (u16x8, i16x8, as_i16x8), - (i16x8, u16x8, as_u16x8), - (u8x16, i8x16, as_i8x16), - (i8x16, u8x16, as_u8x16) -); - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn operators() { - test_ops_si!(i8x16, i16x8, i32x4, i64x2); - test_ops_ui!(u8x16, u16x8, u32x4, u64x2); - test_ops_f!(f32x4, f64x2); - } -} diff --git a/coresimd/v256.rs b/coresimd/v256.rs deleted file mode 100644 index 384ac4ed04..0000000000 --- a/coresimd/v256.rs +++ /dev/null @@ -1,134 +0,0 @@ -//! 256-bit wide vector types - -use prelude::v1::*; - -use coresimd::simd_llvm::*; - -define_ty! { f64x4, f64, f64, f64, f64 } -define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 } - -define_ty! { f32x8, f32, f32, f32, f32, f32, f32, f32, f32 } -define_impl! { f32x8, f32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! 
{ u64x4, u64, u64, u64, u64 } -define_impl! { u64x4, u64, 4, i64x4, x0, x1, x2, x3 } - -define_ty! { i64x4, i64, i64, i64, i64 } -define_impl! { i64x4, i64, 4, i64x4, x0, x1, x2, x3 } - -define_ty! { u32x8, u32, u32, u32, u32, u32, u32, u32, u32 } -define_impl! { u32x8, u32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { i32x8, i32, i32, i32, i32, i32, i32, i32, i32 } -define_impl! { i32x8, i32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { - u16x16, - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16 -} -define_impl! { - u16x16, u16, 16, i16x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - i16x16, - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16 -} -define_impl! { - i16x16, i16, 16, i16x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - u8x32, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 -} -define_impl! { - u8x32, u8, 32, i8x32, - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -} - -define_ty! { - i8x32, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 -} -define_impl! { - i8x32, i8, 32, i8x32, - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -} - -define_from!(u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); -define_from!(i64x4, u64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); -define_from!(u32x8, u64x4, i64x4, i32x8, u16x16, i16x16, u8x32, i8x32); -define_from!(i32x8, u64x4, i64x4, u32x8, u16x16, i16x16, u8x32, i8x32); -define_from!(u16x16, u64x4, i64x4, u32x8, i32x8, i16x16, u8x32, i8x32); -define_from!(i16x16, u64x4, i64x4, u32x8, i32x8, u16x16, u8x32, i8x32); -define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32); -define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32); - -define_common_ops!( - f64x4, - f32x8, - u64x4, - i64x4, - u32x8, - i32x8, - u16x16, - i16x16, - u8x32, - i8x32 -); -define_float_ops!(f64x4, f32x8); -define_integer_ops!( - (u64x4, u64), - (i64x4, i64), - (u32x8, u32), - (i32x8, i32), - (u16x16, u16), - (i16x16, i16), - (u8x32, u8), - (i8x32, i8) -); -define_signed_integer_ops!(i64x4, i32x8, i16x16, i8x32); -define_casts!( - (f64x4, f32x4, as_f32x4), - (f64x4, u64x4, as_u64x4), - (f64x4, i64x4, as_i64x4), - (f32x8, u32x8, as_u32x8), - (f32x8, i32x8, as_i32x8), - (u64x4, f64x4, as_f64x4), - (u64x4, i64x4, as_i64x4), - (i64x4, f64x4, as_f64x4), - (i64x4, u64x4, as_u64x4), - (u32x8, f32x8, as_f32x8), - (u32x8, i32x8, as_i32x8), - (i32x8, f32x8, as_f32x8), - (i32x8, u32x8, as_u32x8), - (u16x16, i16x16, as_i16x16), - (i16x16, u16x16, as_u16x16), - (u8x32, i8x32, as_i8x32), - (i8x32, u8x32, as_u8x32) -); - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn operators() { - test_ops_si!(i8x32, i16x16, i32x8, i64x4); - test_ops_ui!(u8x32, u16x16, u32x8, u64x4); - test_ops_f!(f32x8, f64x4); - } -} diff --git a/coresimd/v512.rs b/coresimd/v512.rs deleted file mode 100644 index 351bef9f02..0000000000 --- a/coresimd/v512.rs +++ /dev/null @@ -1,180 +0,0 @@ -//! 512-bit wide vector types - -use prelude::v1::*; - -use coresimd::simd_llvm::*; - -define_ty! 
{ f64x8, f64, f64, f64, f64, f64, f64, f64, f64 } -define_impl! { f64x8, f64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { - f32x16, - f32, f32, f32, f32, f32, f32, f32, f32, - f32, f32, f32, f32, f32, f32, f32, f32 -} -define_impl! { - f32x16, f32, 16, i32x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { u64x8, u64, u64, u64, u64, u64, u64, u64, u64 } -define_impl! { u64x8, u64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { i64x8, i64, i64, i64, i64, i64, i64, i64, i64 } -define_impl! { i64x8, i64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { - u32x16, - u32, u32, u32, u32, u32, u32, u32, u32, - u32, u32, u32, u32, u32, u32, u32, u32 -} -define_impl! { - u32x16, u32, 16, i32x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - i32x16, - i32, i32, i32, i32, i32, i32, i32, i32, - i32, i32, i32, i32, i32, i32, i32, i32 -} -define_impl! { - i32x16, i32, 16, i32x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - u16x32, - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16 -} -define_impl! { - u16x32, u16, 32, i16x32, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -} - -define_ty! { - i16x32, - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16 -} -define_impl! { - i16x32, i16, 32, i16x32, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -} - -define_ty! { - u8x64, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 -} -define_impl! { - u8x64, u8, 64, i8x64, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31, - x32, x33, x34, x35, x36, x37, x38, x39, - x40, x41, x42, x43, x44, x45, x46, x47, - x48, x49, x50, x51, x52, x53, x54, x55, - x56, x57, x58, x59, x60, x61, x62, x63 -} - -define_ty! { - i8x64, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 -} -define_impl! 
{ - i8x64, i8, 64, i8x64, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31, - x32, x33, x34, x35, x36, x37, x38, x39, - x40, x41, x42, x43, x44, x45, x46, x47, - x48, x49, x50, x51, x52, x53, x54, x55, - x56, x57, x58, x59, x60, x61, x62, x63 -} - -define_from!(u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64); -define_from!(i64x8, u64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64); -define_from!(u32x16, u64x8, i64x8, i32x16, u16x32, i16x32, u8x64, i8x64); -define_from!(i32x16, u64x8, i64x8, u32x16, u16x32, i16x32, u8x64, i8x64); -define_from!(u16x32, u64x8, i64x8, u32x16, i32x16, i16x32, u8x64, i8x64); -define_from!(i16x32, u64x8, i64x8, u32x16, i32x16, u16x32, u8x64, i8x64); -define_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64); -define_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64); - -define_common_ops!( - f64x8, - f32x16, - u64x8, - i64x8, - u32x16, - i32x16, - u16x32, - i16x32, - u8x64, - i8x64 -); -define_float_ops!(f64x8, f32x16); -define_integer_ops!( - (u64x8, u64), - (i64x8, i64), - (u32x16, u32), - (i32x16, i32), - (u16x32, u16), - (i16x32, i16), - (u8x64, u8), - (i8x64, i8) -); -define_signed_integer_ops!(i64x8, i32x16, i16x32, i8x64); -define_casts!( - (f64x8, f32x8, as_f32x8), - (f64x8, u64x8, as_u64x8), - (f64x8, i64x8, as_i64x8), - (f32x16, u32x16, as_u32x16), - (f32x16, i32x16, as_i32x16), - (u64x8, f64x8, as_f64x8), - (u64x8, i64x8, as_i64x8), - (i64x8, f64x8, as_f64x8), - (i64x8, u64x8, as_u64x8), - (u32x16, f32x16, as_f32x16), - (u32x16, i32x16, as_i32x16), - (i32x16, f32x16, as_f32x16), - (i32x16, u32x16, as_u32x16), - (u16x32, i16x32, as_i16x32), - (i16x32, u16x32, as_u16x32), - (u8x64, i8x64, as_i8x64), - (i8x64, u8x64, as_u8x64) -); - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn operators() { - test_ops_si!(i8x64, i16x32, i32x16, i64x8); - test_ops_ui!(u8x64, u16x32, u32x16, u64x8); - test_ops_f!(f32x16, f64x8); - } -} diff --git a/coresimd/x86/aes.rs b/coresimd/x86/aes.rs index 6cb618343c..9ced4a2022 100644 --- a/coresimd/x86/aes.rs +++ b/coresimd/x86/aes.rs @@ -60,7 +60,7 @@ pub unsafe fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i { aesenclast(a, round_key) } -/// Perform the “InvMixColumns” transformation on `a`. +/// Perform the `InvMixColumns` transformation on `a`. #[inline] #[target_feature(enable = "aes")] #[cfg_attr(test, assert_instr(aesimc))] diff --git a/coresimd/x86/avx.rs b/coresimd/x86/avx.rs index 81cc7dff98..0686369481 100644 --- a/coresimd/x86/avx.rs +++ b/coresimd/x86/avx.rs @@ -14,8 +14,7 @@ //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions use coresimd::simd_llvm::*; -use coresimd::v128::*; -use coresimd::v256::*; +use coresimd::simd::*; use coresimd::x86::*; use intrinsics; use mem; diff --git a/coresimd/x86/avx2.rs b/coresimd/x86/avx2.rs index 7cba6aa7cf..763ccf8614 100644 --- a/coresimd/x86/avx2.rs +++ b/coresimd/x86/avx2.rs @@ -19,10 +19,7 @@ //! 
[wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate use coresimd::simd_llvm::*; -use coresimd::v256::*; -use coresimd::v128::*; -use coresimd::v64::*; -use coresimd::v32::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; @@ -142,102 +139,144 @@ pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { let b = b.as_i8x32(); let r: i8x32 = match n { - 0 => { - simd_shuffle32(b, a, [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ]) - } - 1 => { - simd_shuffle32(b, a, [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, - ]) - } - 2 => { - simd_shuffle32(b, a, [ - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, - 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, - ]) - } - 3 => { - simd_shuffle32(b, a, [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, - ]) - } - 4 => { - simd_shuffle32(b, a, [ - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, - ]) - } - 5 => { - simd_shuffle32(b, a, [ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, - ]) - } - 6 => { - simd_shuffle32(b, a, [ + 0 => simd_shuffle32( + b, + a, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle32( + b, + a, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, + ], + ), + 2 => simd_shuffle32( + b, + a, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, + ], + ), + 3 => simd_shuffle32( + b, + a, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, + ], + ), + 4 => simd_shuffle32( + b, + a, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, + ], + ), + 5 => simd_shuffle32( + b, + a, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, + ], + ), + 6 => simd_shuffle32( + b, + a, + [ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, - ]) - } - 7 => { - simd_shuffle32(b, a, [ + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, + 53, + ], + ), + 7 => simd_shuffle32( + b, + a, + [ 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, - 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, - ]) - } - 8 => { - simd_shuffle32(b, a, [ + 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, + 54, + ], + ), + 8 => simd_shuffle32( + b, + a, + [ 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, - 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, - ]) - } - 9 => { - simd_shuffle32(b, a, [ + 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, + 55, + ], + ), + 9 => simd_shuffle32( + b, + a, + [ 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, - 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, - ]) - } - 10 => { - simd_shuffle32(b, a, [ - 10, 11, 12, 13, 14, 15, 
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - ]) - } - 11 => { - simd_shuffle32(b, a, [ - 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, - 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - ]) - } - 12 => { - simd_shuffle32(b, a, [ - 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, - 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - ]) - } - 13 => { - simd_shuffle32(b, a, [ - 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - ]) - } - 14 => { - simd_shuffle32(b, a, [ - 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, - ]) - } - 15 => { - simd_shuffle32(b, a, [ - 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, - 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, - ]) - } + 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, + 56, + ], + ), + 10 => simd_shuffle32( + b, + a, + [ + 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, + ], + ), + 11 => simd_shuffle32( + b, + a, + [ + 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, + 57, 58, + ], + ), + 12 => simd_shuffle32( + b, + a, + [ + 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + 58, 59, + ], + ), + 13 => simd_shuffle32( + b, + a, + [ + 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, + ], + ), + 14 => simd_shuffle32( + b, + a, + [ + 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, + ], + ), + 15 => simd_shuffle32( + b, + a, + [ + 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, + ], + ), _ => b, }; mem::transmute(r) diff --git a/coresimd/x86/mmx.rs b/coresimd/x86/mmx.rs index 411ce12063..a8ba18f998 100644 --- a/coresimd/x86/mmx.rs +++ b/coresimd/x86/mmx.rs @@ -8,7 +8,7 @@ //! //! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -use coresimd::v64::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; diff --git a/coresimd/x86/mod.rs b/coresimd/x86/mod.rs index 1770a719fe..9195e58be0 100644 --- a/coresimd/x86/mod.rs +++ b/coresimd/x86/mod.rs @@ -359,42 +359,42 @@ pub(crate) trait m128iExt: Sized { fn as_m128i(self) -> __m128i; #[inline] - fn as_u8x16(self) -> ::coresimd::v128::u8x16 { + fn as_u8x16(self) -> ::coresimd::simd::u8x16 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_u16x8(self) -> ::coresimd::v128::u16x8 { + fn as_u16x8(self) -> ::coresimd::simd::u16x8 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_u32x4(self) -> ::coresimd::v128::u32x4 { + fn as_u32x4(self) -> ::coresimd::simd::u32x4 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_u64x2(self) -> ::coresimd::v128::u64x2 { + fn as_u64x2(self) -> ::coresimd::simd::u64x2 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_i8x16(self) -> ::coresimd::v128::i8x16 { + fn as_i8x16(self) -> ::coresimd::simd::i8x16 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_i16x8(self) -> ::coresimd::v128::i16x8 { + fn as_i16x8(self) -> ::coresimd::simd::i16x8 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_i32x4(self) -> ::coresimd::v128::i32x4 { + fn as_i32x4(self) -> ::coresimd::simd::i32x4 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_i64x2(self) -> ::coresimd::v128::i64x2 { + fn as_i64x2(self) -> ::coresimd::simd::i64x2 { unsafe { mem::transmute(self.as_m128i()) } } } @@ -412,42 +412,42 @@ pub(crate) trait m256iExt: Sized { fn as_m256i(self) -> __m256i; #[inline] - fn as_u8x32(self) -> ::coresimd::v256::u8x32 { + fn as_u8x32(self) -> ::coresimd::simd::u8x32 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_u16x16(self) -> ::coresimd::v256::u16x16 { + fn as_u16x16(self) -> ::coresimd::simd::u16x16 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_u32x8(self) -> ::coresimd::v256::u32x8 { + fn as_u32x8(self) -> ::coresimd::simd::u32x8 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_u64x4(self) -> ::coresimd::v256::u64x4 { + fn as_u64x4(self) -> ::coresimd::simd::u64x4 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_i8x32(self) -> ::coresimd::v256::i8x32 { + fn as_i8x32(self) -> ::coresimd::simd::i8x32 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_i16x16(self) -> ::coresimd::v256::i16x16 { + fn as_i16x16(self) -> ::coresimd::simd::i16x16 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_i32x8(self) -> ::coresimd::v256::i32x8 { + fn as_i32x8(self) -> ::coresimd::simd::i32x8 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_i64x4(self) -> ::coresimd::v256::i64x4 { + fn as_i64x4(self) -> ::coresimd::simd::i64x4 { unsafe { mem::transmute(self.as_m256i()) } } } @@ -459,6 +459,99 @@ impl m256iExt for __m256i { } } +use coresimd::simd::{b8x32, b8x16, b8x8, + f32x4, f32x8, f64x2, f64x4, i16x16, + i16x4, i16x8, i32x2, i32x4, i32x8, i64x2, i64x4, i8x16, + i8x32, i8x8, u16x16, u16x4, u16x8, u32x2, u32x4, u32x8, + u64x2, u64x4, u8x16, u8x32, u8x8}; + +impl_from_bits_!( + __m64: u32x2, + i32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + __m128: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + 
__m128i: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + __m128d: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + __m256: u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits_!( + __m256i: u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits_!( + __m256d: u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); mod eflags; pub use self::eflags::*; diff --git a/coresimd/x86/pclmulqdq.rs b/coresimd/x86/pclmulqdq.rs index 8c4f4b8574..b36f69cbc0 100644 --- a/coresimd/x86/pclmulqdq.rs +++ b/coresimd/x86/pclmulqdq.rs @@ -23,20 +23,26 @@ extern "C" { /// should be used. Immediate bits other than 0 and 4 are ignored. #[inline] #[target_feature(enable = "pclmulqdq")] -#[cfg_attr(all(test, not(target_os="linux")), assert_instr(pclmulqdq, imm8 = 0))] -#[cfg_attr(all(test, target_os="linux"), assert_instr(pclmullqlqdq, imm8 = 0))] -#[cfg_attr(all(test, target_os="linux"), assert_instr(pclmulhqlqdq, imm8 = 1))] -#[cfg_attr(all(test, target_os="linux"), assert_instr(pclmullqhqdq, imm8 = 16))] -#[cfg_attr(all(test, target_os="linux"), assert_instr(pclmulhqhqdq, imm8 = 17))] +#[cfg_attr(all(test, not(target_os = "linux")), + assert_instr(pclmulqdq, imm8 = 0))] +#[cfg_attr(all(test, target_os = "linux"), + assert_instr(pclmullqlqdq, imm8 = 0))] +#[cfg_attr(all(test, target_os = "linux"), + assert_instr(pclmulhqlqdq, imm8 = 1))] +#[cfg_attr(all(test, target_os = "linux"), + assert_instr(pclmullqhqdq, imm8 = 16))] +#[cfg_attr(all(test, target_os = "linux"), + assert_instr(pclmulhqhqdq, imm8 = 17))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_clmulepi64_si128(a: __m128i, b: __m128i, imm8: i32) -> __m128i { +pub unsafe fn _mm_clmulepi64_si128( + a: __m128i, b: __m128i, imm8: i32 +) -> __m128i { macro_rules! call { ($imm8:expr) => (pclmulqdq(a, b, $imm8)) } constify_imm8!(imm8, call) } - #[cfg(test)] mod tests { // The constants in the tests below are just bit patterns. They should not diff --git a/coresimd/x86/rdrand.rs b/coresimd/x86/rdrand.rs index e15da28e0e..9877125851 100644 --- a/coresimd/x86/rdrand.rs +++ b/coresimd/x86/rdrand.rs @@ -1,6 +1,6 @@ //! RDRAND and RDSEED instructions for returning random numbers from an Intel -//! on-chip hardware random number generator which has been seeded by an on-chip -//! entropy source. +//! on-chip hardware random number generator which has been seeded by an +//! on-chip entropy source. extern "platform-intrinsic" { fn x86_rdrand16_step() -> (u16, i32); @@ -17,6 +17,7 @@ use stdsimd_test::assert_instr; #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { let (v, flag) = x86_rdrand16_step(); *val = v; @@ -28,6 +29,7 @@ pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 { let (v, flag) = x86_rdrand32_step(); *val = v; diff --git a/coresimd/x86/sse.rs b/coresimd/x86/sse.rs index 1d871ab76f..6b5157149f 100644 --- a/coresimd/x86/sse.rs +++ b/coresimd/x86/sse.rs @@ -1,8 +1,7 @@ //! 
Streaming SIMD Extensions (SSE) use coresimd::simd_llvm::*; -use coresimd::v128::*; -use coresimd::v64::*; +use coresimd::simd::*; use coresimd::x86::*; use intrinsics; use mem; @@ -2167,12 +2166,10 @@ pub unsafe fn _mm_cvtps_pi8(a: __m128) -> __m64 { mod tests { use std::mem::transmute; use std::f32::NAN; - use stdsimd_test::simd_test; use test::black_box; // Used to inhibit constant-folding. - use coresimd::v128::*; - use coresimd::v64::*; + use coresimd::simd::*; use coresimd::x86::*; #[simd_test = "sse"] diff --git a/coresimd/x86/sse2.rs b/coresimd/x86/sse2.rs index dcfb149ee0..b16b920b59 100644 --- a/coresimd/x86/sse2.rs +++ b/coresimd/x86/sse2.rs @@ -4,8 +4,7 @@ use stdsimd_test::assert_instr; use coresimd::simd_llvm::*; -use coresimd::v128::*; -use coresimd::v64::*; +use coresimd::simd::*; use coresimd::x86::*; use intrinsics; use mem; @@ -319,6 +318,8 @@ pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_slli_si128_impl(a, imm8) } +/// Implementation detail: converts the immediate argument of the +/// `_mm_slli_si128` intrinsic into a compile-time constant. #[inline] #[target_feature(enable = "sse2")] unsafe fn _mm_slli_si128_impl(a: __m128i, imm8: i32) -> __m128i { @@ -479,6 +480,8 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_srli_si128_impl(a, imm8) } +/// Implementation detail: converts the immediate argument of the +/// `_mm_srli_si128` intrinsic into a compile-time constant. #[inline] #[target_feature(enable = "sse2")] unsafe fn _mm_srli_si128_impl(a: __m128i, imm8: i32) -> __m128i { @@ -2502,7 +2505,7 @@ mod tests { use stdsimd_test::simd_test; use test::black_box; // Used to inhibit constant-folding. use coresimd::x86::*; - use coresimd::v128::*; + use coresimd::simd::*; #[simd_test = "sse2"] unsafe fn test_mm_pause() { diff --git a/coresimd/x86/sse3.rs b/coresimd/x86/sse3.rs index b380cbe310..6272502996 100644 --- a/coresimd/x86/sse3.rs +++ b/coresimd/x86/sse3.rs @@ -1,7 +1,7 @@ //! Streaming SIMD Extensions 3 (SSE3) use coresimd::simd_llvm::{simd_shuffle2, simd_shuffle4}; -use coresimd::v128::*; +use coresimd::simd::*; use coresimd::x86::*; #[cfg(test)] diff --git a/coresimd/x86/sse41.rs b/coresimd/x86/sse41.rs index 84270c5278..e07ffd47e4 100644 --- a/coresimd/x86/sse41.rs +++ b/coresimd/x86/sse41.rs @@ -1,10 +1,7 @@ //! Streaming SIMD Extensions 4.1 (SSE4.1) use coresimd::simd_llvm::*; -use coresimd::v128::*; -use coresimd::v64::*; -use coresimd::v32::*; -use coresimd::v16::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; diff --git a/coresimd/x86/sse42.rs b/coresimd/x86/sse42.rs index 75456858fe..7dd7abb1c4 100644 --- a/coresimd/x86/sse42.rs +++ b/coresimd/x86/sse42.rs @@ -6,7 +6,7 @@ use stdsimd_test::assert_instr; use coresimd::simd_llvm::*; -use coresimd::v128::*; +use coresimd::simd::*; use coresimd::x86::*; /// String contains unsigned 8-bit characters *(Default)* diff --git a/coresimd/x86/sse4a.rs b/coresimd/x86/sse4a.rs index 5db910004f..0aae55502a 100644 --- a/coresimd/x86/sse4a.rs +++ b/coresimd/x86/sse4a.rs @@ -1,6 +1,6 @@ //! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) -use coresimd::v128::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; diff --git a/coresimd/x86/ssse3.rs b/coresimd/x86/ssse3.rs index 7b97443fb5..c15ad76613 100644 --- a/coresimd/x86/ssse3.rs +++ b/coresimd/x86/ssse3.rs @@ -1,7 +1,7 @@ //! 
Supplemental Streaming SIMD Extensions 3 (SSSE3) use coresimd::simd_llvm::simd_shuffle16; -use coresimd::v128::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; diff --git a/coresimd/x86/test.rs b/coresimd/x86/test.rs index e03f3c413c..1b5b6b1fb0 100644 --- a/coresimd/x86/test.rs +++ b/coresimd/x86/test.rs @@ -134,6 +134,6 @@ mod x86_polyfill { } #[cfg(target_arch = "x86_64")] mod x86_polyfill { - pub use coresimd::x86_64::{_mm_insert_epi64, _mm256_insert_epi64}; + pub use coresimd::x86_64::{_mm256_insert_epi64, _mm_insert_epi64}; } pub use self::x86_polyfill::*; diff --git a/coresimd/x86_64/rdrand.rs b/coresimd/x86_64/rdrand.rs index b3311c2b96..917e900fef 100644 --- a/coresimd/x86_64/rdrand.rs +++ b/coresimd/x86_64/rdrand.rs @@ -1,6 +1,6 @@ //! RDRAND and RDSEED instructions for returning random numbers from an Intel -//! on-chip hardware random number generator which has been seeded by an on-chip -//! entropy source. +//! on-chip hardware random number generator which has been seeded by an +//! on-chip entropy source. extern "platform-intrinsic" { fn x86_rdrand64_step() -> (u64, i32); @@ -15,6 +15,7 @@ use stdsimd_test::assert_instr; #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 { let (v, flag) = x86_rdrand64_step(); *val = v; diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs index becfc7f824..2400ccc082 100644 --- a/crates/coresimd/src/lib.rs +++ b/crates/coresimd/src/lib.rs @@ -1,7 +1,7 @@ //! SIMD and vendor intrinsics support library. //! -//! This documentation is for the `coresimd` crate, but you probably want to use -//! the [`stdsimd` crate][stdsimd] which should have more complete +//! This documentation is for the `coresimd` crate, but you probably want to +//! use the [`stdsimd` crate][stdsimd] which should have more complete //! documentation. //! //! 
[stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/ @@ -13,8 +13,10 @@ simd_ffi, target_feature, cfg_target_feature, i128_type, asm, integer_atomics, stmt_expr_attributes, core_intrinsics, crate_in_paths, no_core, attr_literals, rustc_attrs, stdsimd, - staged_api)] -#![cfg_attr(test, feature(proc_macro, test, attr_literals, abi_vectorcall))] + staged_api, fn_must_use, core_float, core_slice_ext, align_offset)] +#![cfg_attr(test, + feature(proc_macro, test, attr_literals, abi_vectorcall, + untagged_unions))] #![cfg_attr(feature = "cargo-clippy", allow(inline_always, too_many_arguments, cast_sign_loss, cast_lossless, cast_possible_wrap, @@ -25,7 +27,8 @@ #![no_core] #![unstable(feature = "stdsimd", issue = "0")] #![doc(test(attr(deny(warnings))), - test(attr(allow(dead_code, deprecated, unused_variables, unused_mut))))] + test(attr(allow(dead_code, deprecated, unused_variables, + unused_mut))))] #[cfg_attr(not(test), macro_use)] extern crate core as _core; @@ -33,12 +36,12 @@ extern crate core as _core; #[macro_use] extern crate std; #[cfg(test)] +#[macro_use] +extern crate stdsimd; +#[cfg(test)] extern crate stdsimd_test; #[cfg(test)] extern crate test; -#[cfg(test)] -#[macro_use] -extern crate stdsimd; #[path = "../../../coresimd/mod.rs"] mod coresimd; @@ -53,8 +56,12 @@ use _core::cmp; #[allow(unused_imports)] use _core::convert; #[allow(unused_imports)] +use _core::default; +#[allow(unused_imports)] use _core::fmt; #[allow(unused_imports)] +use _core::hash; +#[allow(unused_imports)] use _core::intrinsics; #[allow(unused_imports)] use _core::iter; @@ -63,6 +70,8 @@ use _core::marker; #[allow(unused_imports)] use _core::mem; #[allow(unused_imports)] +use _core::num; +#[allow(unused_imports)] use _core::ops; #[allow(unused_imports)] use _core::option; @@ -72,3 +81,5 @@ use _core::prelude; use _core::ptr; #[allow(unused_imports)] use _core::result; +#[allow(unused_imports)] +use _core::slice; diff --git a/crates/coresimd/tests/cpu-detection.rs b/crates/coresimd/tests/cpu-detection.rs index 3cd7c580bf..eeb5a4b7b2 100644 --- a/crates/coresimd/tests/cpu-detection.rs +++ b/crates/coresimd/tests/cpu-detection.rs @@ -26,8 +26,14 @@ fn x86_all() { println!("avx512bw {:?}", is_target_feature_detected!("avx512bw")); println!("avx512dq {:?}", is_target_feature_detected!("avx512dq")); println!("avx512vl {:?}", is_target_feature_detected!("avx512vl")); - println!("avx512_ifma {:?}", is_target_feature_detected!("avx512ifma")); - println!("avx512_vbmi {:?}", is_target_feature_detected!("avx512vbmi")); + println!( + "avx512_ifma {:?}", + is_target_feature_detected!("avx512ifma") + ); + println!( + "avx512_vbmi {:?}", + is_target_feature_detected!("avx512vbmi") + ); println!( "avx512_vpopcntdq {:?}", is_target_feature_detected!("avx512vpopcntdq") diff --git a/crates/stdsimd-verify/src/lib.rs b/crates/stdsimd-verify/src/lib.rs index ddfa7adab9..1fad214406 100644 --- a/crates/stdsimd-verify/src/lib.rs +++ b/crates/stdsimd-verify/src/lib.rs @@ -241,7 +241,8 @@ fn find_target_feature(attrs: &[syn::Attribute]) -> Option { } fn find_required_const(attrs: &[syn::Attribute]) -> Vec { - attrs.iter() + attrs + .iter() .filter(|a| a.path.segments[0].ident == "rustc_args_required_const") .map(|a| a.tts.clone()) .map(|a| syn::parse::(a.into()).unwrap()) diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index a709108547..bc9ef5f948 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -312,7 +312,9 @@ fn 
matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { if rust.arguments.len() != intel.parameters.len() { bail!("wrong number of arguments on {}", rust.name) } - for (i, (a, b)) in intel.parameters.iter().zip(rust.arguments).enumerate() { + for (i, (a, b)) in + intel.parameters.iter().zip(rust.arguments).enumerate() + { let is_const = rust.required_const.contains(&i); equate(b, &a.type_, &intel.name, is_const)?; } @@ -353,15 +355,14 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { Ok(()) } -fn equate(t: &Type, - intel: &str, - intrinsic: &str, - is_const: bool) -> Result<(), String> { +fn equate( + t: &Type, intel: &str, intrinsic: &str, is_const: bool +) -> Result<(), String> { let intel = intel.replace(" *", "*"); let intel = intel.replace(" const*", "*"); let require_const = || { if is_const { - return Ok(()) + return Ok(()); } Err(format!("argument required to be const but isn't")) }; diff --git a/crates/stdsimd/src/lib.rs b/crates/stdsimd/src/lib.rs index 16748a3ba4..0c153b35b9 100644 --- a/crates/stdsimd/src/lib.rs +++ b/crates/stdsimd/src/lib.rs @@ -8,15 +8,16 @@ //! [stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/ #![feature(const_fn, integer_atomics, staged_api, stdsimd)] +#![cfg_attr(feature = "cargo-clippy", allow(shadow_reuse))] #![cfg_attr(target_os = "linux", feature(linkage))] #![no_std] #![unstable(feature = "stdsimd", issue = "0")] -extern crate std as _std; -extern crate coresimd; -extern crate libc; #[macro_use] extern crate cfg_if; +extern crate coresimd; +extern crate libc; +extern crate std as _std; #[cfg(test)] #[macro_use] diff --git a/crates/stdsimd/tests/cpu-detection.rs b/crates/stdsimd/tests/cpu-detection.rs index 97fa930872..8596536f17 100644 --- a/crates/stdsimd/tests/cpu-detection.rs +++ b/crates/stdsimd/tests/cpu-detection.rs @@ -59,8 +59,14 @@ fn x86_all() { println!("avx512bw {:?}", is_target_feature_detected!("avx512bw")); println!("avx512dq {:?}", is_target_feature_detected!("avx512dq")); println!("avx512vl {:?}", is_target_feature_detected!("avx512vl")); - println!("avx512_ifma {:?}", is_target_feature_detected!("avx512ifma")); - println!("avx512_vbmi {:?}", is_target_feature_detected!("avx512vbmi")); + println!( + "avx512_ifma {:?}", + is_target_feature_detected!("avx512ifma") + ); + println!( + "avx512_vbmi {:?}", + is_target_feature_detected!("avx512vbmi") + ); println!( "avx512_vpopcntdq {:?}", is_target_feature_detected!("avx512vpopcntdq") diff --git a/examples/hex.rs b/examples/hex.rs index 6a0d4c1135..fa0cc2685f 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -14,10 +14,18 @@ #![feature(cfg_target_feature, target_feature, stdsimd)] #![cfg_attr(test, feature(test))] +#![cfg_attr(feature = "cargo-clippy", + allow(result_unwrap_used, print_stdout, option_unwrap_used, + shadow_reuse, cast_possible_wrap, cast_sign_loss, + missing_docs_in_private_items))] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[macro_use] extern crate stdsimd; +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +extern crate stdsimd; + #[cfg(test)] #[macro_use] extern crate quickcheck; diff --git a/examples/nbody.rs b/examples/nbody.rs index a8e1927888..4abe850d8e 100644 --- a/examples/nbody.rs +++ b/examples/nbody.rs @@ -11,8 +11,7 @@ shadow_reuse, print_stdout))] extern crate stdsimd; -use self::stdsimd::simd; -use simd::f64x2; +use stdsimd::simd::*; const PI: f64 = std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; @@ -31,17 +30,16 @@ impl Frsqrt for f64x2 { use 
stdsimd::arch::x86::*; #[cfg(target_arch = "x86_64")] use stdsimd::arch::x86_64::*; + let t: f32x2 = (*self).into(); - let t = self.as_f32x2(); - - let u = unsafe { + let u: f64x4 = unsafe { let res = _mm_rsqrt_ps(_mm_setr_ps( t.extract(0), t.extract(1), 0., 0., )); - std::mem::transmute::<_, simd::f32x4>(res).as_f64x4() + f32x4::from_bits(res).into() }; Self::new(u.extract(0), u.extract(1)) } @@ -53,7 +51,7 @@ impl Frsqrt for f64x2 { #[cfg(target_arch = "aarch64")] use stdsimd::arch::aarch64::*; - unsafe { vrsqrte_f32(self.as_f32x2()).as_f64x2() } + unsafe { vrsqrte_f32((*self).into()).into() } } #[cfg(not(any(all(any(target_arch = "x86", target_arch = "x86_64"), @@ -84,7 +82,7 @@ impl Body { x: [x0, x1, x2], _fill: 0.0, v: [v0, v1, v2], - mass: mass, + mass, } } } @@ -133,7 +131,7 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { * (distance * distance) } dmag = f64x2::splat(dt) / dsquared * distance; - dmag.store(&mut mag, i); + dmag.store_unaligned(&mut mag[i..]); i += 2; } diff --git a/stdsimd/arch/detect/cache.rs b/stdsimd/arch/detect/cache.rs index ae9fe64f8c..1473e4fc27 100644 --- a/stdsimd/arch/detect/cache.rs +++ b/stdsimd/arch/detect/cache.rs @@ -23,6 +23,7 @@ pub const fn test_bit(x: u64, bit: u32) -> bool { const CACHE_CAPACITY: u32 = 63; /// This type is used to initialize the cache +#[derive(Copy, Clone)] pub struct Initializer(u64); impl Default for Initializer { diff --git a/stdsimd/arch/detect/linux.rs b/stdsimd/arch/detect/linux.rs index c331d8a5dc..71373667e2 100644 --- a/stdsimd/arch/detect/linux.rs +++ b/stdsimd/arch/detect/linux.rs @@ -1,4 +1,3 @@ - #![allow(dead_code)] use core::mem; @@ -43,9 +42,10 @@ pub struct AuxVec { /// /// [auxvec_h]: https://github.com/torvalds/linux/blob/master/include/uapi/linux/auxvec.h /// [auxv_docs]: https://docs.rs/auxv/0.3.3/auxv/ +#[cfg_attr(feature = "cargo-clippy", allow(items_after_statements))] pub fn auxv() -> Result { if !cfg!(target_os = "linux") { - return Err(()) + return Err(()); } if let Ok(hwcap) = getauxval(AT_HWCAP) { #[cfg(target_arch = "aarch64")] @@ -78,7 +78,10 @@ pub fn auxv() -> Result { pub type F = unsafe extern "C" fn(usize) -> usize; unsafe { - let ptr = libc::dlsym(libc::RTLD_DEFAULT, "getauxval\0".as_ptr() as *const _); + let ptr = libc::dlsym( + libc::RTLD_DEFAULT, + "getauxval\0".as_ptr() as *const _, + ); if ptr.is_null() { return Err(()); } @@ -97,7 +100,7 @@ fn auxv_from_file(file: &str) -> Result { // The auxiliary vector contains at most 32 (key,value) fields: from // `AT_EXECFN = 31` to `AT_NULL = 0`. That is, a buffer of // 2*32 `usize` elements is enough to read the whole vector. - let mut buf = [0usize; 64]; + let mut buf = [0_usize; 64]; { let raw: &mut [u8; 64 * mem::size_of::()] = unsafe { mem::transmute(&mut buf) }; @@ -128,15 +131,12 @@ fn auxv_from_buf(buf: &[usize; 64]) -> Result { _ => (), } } - if hwcap.is_some() && hwcap2.is_some() { - return Ok(AuxVec { - hwcap: hwcap.unwrap(), - hwcap2: hwcap2.unwrap(), - }); + + if let (Some(hwcap), Some(hwcap2)) = (hwcap, hwcap2) { + return Ok(AuxVec { hwcap, hwcap2 }); } } - drop(buf); Err(()) } @@ -147,9 +147,9 @@ pub struct CpuInfo { impl CpuInfo { /// Reads /proc/cpuinfo into CpuInfo. 
- pub fn new() -> Result { + pub fn new() -> Result { let mut file = File::open("/proc/cpuinfo")?; - let mut cpui = CpuInfo { raw: String::new() }; + let mut cpui = Self { raw: String::new() }; file.read_to_string(&mut cpui.raw)?; Ok(cpui) } @@ -157,7 +157,7 @@ impl CpuInfo { pub fn field(&self, field: &str) -> CpuInfoField { for l in self.raw.lines() { if l.trim().starts_with(field) { - return CpuInfoField::new(l.split(": ").skip(1).next()); + return CpuInfoField::new(l.split(": ").nth(1)); } } CpuInfoField(None) @@ -170,8 +170,8 @@ impl CpuInfo { } #[cfg(test)] - fn from_str(other: &str) -> Result { - Ok(CpuInfo { + fn from_str(other: &str) -> Result { + Ok(Self { raw: String::from(other), }) } @@ -184,7 +184,7 @@ pub struct CpuInfoField<'a>(Option<&'a str>); impl<'a> PartialEq<&'a str> for CpuInfoField<'a> { fn eq(&self, other: &&'a str) -> bool { match self.0 { - None => other.len() == 0, + None => other.is_empty(), Some(f) => f == other.trim(), } } @@ -205,10 +205,10 @@ impl<'a> CpuInfoField<'a> { /// Does the field contain `other`? pub fn has(&self, other: &str) -> bool { match self.0 { - None => other.len() == 0, + None => other.is_empty(), Some(f) => { let other = other.trim(); - for v in f.split(" ") { + for v in f.split(' ') { if v == other { return true; } @@ -259,10 +259,10 @@ mod tests { #[test] fn auxv_crate() { - if cfg!(target_arch = "x86") || - cfg!(target_arch = "x86_64") || - cfg!(target_arch = "powerpc") { - return + if cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64") + || cfg!(target_arch = "powerpc") + { + return; } let v = auxv(); if let Some(hwcap) = auxv_crate_getauxval(AT_HWCAP) { @@ -280,8 +280,9 @@ mod tests { #[cfg(target_arch = "arm")] #[test] fn linux_rpi3() { - let v = auxv_from_file("../../stdsimd/arch/detect/test_data/linux-rpi3.auxv") - .unwrap(); + let v = auxv_from_file( + "../../stdsimd/arch/detect/test_data/linux-rpi3.auxv", + ).unwrap(); assert_eq!(v.hwcap, 4174038); assert_eq!(v.hwcap2, 16); } @@ -320,7 +321,7 @@ mod tests { #[test] fn auxv_crate_procfs() { if cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64") { - return + return; } let v = auxv(); if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) { diff --git a/stdsimd/arch/detect/x86.rs b/stdsimd/arch/detect/x86.rs index a49a6c405b..7705cb9934 100644 --- a/stdsimd/arch/detect/x86.rs +++ b/stdsimd/arch/detect/x86.rs @@ -273,6 +273,7 @@ pub enum Feature { /// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID /// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +#[cfg_attr(feature = "cargo-clippy", allow(similar_names))] pub fn detect_features() -> cache::Initializer { let mut value = cache::Initializer::default(); diff --git a/stdsimd/mod.rs b/stdsimd/mod.rs index 11d1d02489..9b73a3dbd4 100644 --- a/stdsimd/mod.rs +++ b/stdsimd/mod.rs @@ -1,10 +1,12 @@ +//! `stdsimd` + /// SIMD and vendor intrinsics module. /// -/// This module is intended to be the gateway to architecture-specific intrinsic -/// functions, typically related to SIMD (but not always!). Each architecture -/// that Rust compiles to may contain a submodule here, which means that this is -/// not a portable module! If you're writing a portable library take care when -/// using these APIs! +/// This module is intended to be the gateway to architecture-specific +/// intrinsic functions, typically related to SIMD (but not always!). 
Each +/// architecture that Rust compiles to may contain a submodule here, which +/// means that this is not a portable module! If you're writing a portable +/// library take care when using these APIs! /// /// Under this module you'll find an architecture-named module, such as /// `x86_64`. Each `#[cfg(target_arch)]` that Rust can compile to may have a @@ -42,13 +44,13 @@ /// /// # CPU Feature Detection /// -/// In order to call these APIs in a safe fashion there's a number of mechanisms -/// available to ensure that the correct CPU feature is available to call an -/// intrinsic. Let's consider, for example, the `_mm256_add_epi64` intrinsics on -/// the `x86` and `x86_64` architectures. This function requires the AVX2 -/// feature as [documented by Intel][intel-dox] so to correctly call this -/// function we need to (a) guarantee we only call it on x86/x86_64 and (b) -/// ensure that the CPU feature is available +/// In order to call these APIs in a safe fashion there's a number of +/// mechanisms available to ensure that the correct CPU feature is available +/// to call an intrinsic. Let's consider, for example, the `_mm256_add_epi64` +/// intrinsics on the `x86` and `x86_64` architectures. This function requires +/// the AVX2 feature as [documented by Intel][intel-dox] so to correctly call +/// this function we need to (a) guarantee we only call it on `x86`/`x86_64` and +/// (b) ensure that the CPU feature is available /// /// [intel-dox]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64&expand=100 /// @@ -80,9 +82,9 @@ /// `#[cfg]` to only compile the code in situations where the safety guarantees /// are upheld. /// -/// Statically enabling a feature is typically done with the `-C target-feature` -/// or `-C target-cpu` flags to the compiler. For example if your local CPU -/// supports AVX2 then you can compile the above function with: +/// Statically enabling a feature is typically done with the `-C +/// target-feature` or `-C target-cpu` flags to the compiler. For example if +/// your local CPU supports AVX2 then you can compile the above function with: /// /// ```sh /// $ RUSTFLAGS='-C target-cpu=native' cargo build @@ -107,8 +109,8 @@ /// sections more optimized for different CPUs. /// /// Taking our previous example from before, we're going to compile our binary -/// *without* AVX2 support, but we'd like to enable it for just one function. We -/// can do that in a manner like: +/// *without* AVX2 support, but we'd like to enable it for just one function. +/// We can do that in a manner like: /// /// ```ignore /// fn foo() { @@ -141,14 +143,15 @@ /// the standard library, this macro will perform necessary runtime detection /// to determine whether the CPU the program is running on supports the /// specified feature. In this case the macro will expand to a boolean -/// expression evaluating to whether the local CPU has the AVX2 feature or not. +/// expression evaluating to whether the local CPU has the AVX2 feature or +/// not. /// /// Note that this macro, like the `arch` module, is platform-specific. The /// name of the macro is the same across platforms, but the arguments to the /// macro are only the features for the current platform. For example calling /// `is_target_feature_detected!("avx2")` on ARM will be a compile time /// error. To ensure we don't hit this error a statement level `#[cfg]` is -/// used to only compile usage of the macro on x86/x86_64. +/// used to only compile usage of the macro on `x86`/`x86_64`. 
/// /// * Next up we see our AVX2-enabled function, `foo_avx2`. This function is /// decorated with the `#[target_feature]` attribute which enables a CPU @@ -166,9 +169,9 @@ /// /// # Ergonomics /// -/// It's important to note that using the `arch` module is not the easiest thing -/// in the world, so if you're curious to try it out you may want to brace -/// yourself for some wordiness! +/// It's important to note that using the `arch` module is not the easiest +/// thing in the world, so if you're curious to try it out you may want to +/// brace yourself for some wordiness! /// /// The primary purpose of this module is to enable stable crates on crates.io /// to build up much more ergonomic abstractions which end up using SIMD under @@ -181,15 +184,15 @@ /// This documentation is only for one particular architecture, you can find /// others at: /// -/// * [x86] -/// * [x86_64] -/// * [arm] -/// * [aarch64] +/// * [`x86`] +/// * [`x86_64`] +/// * [`arm`] +/// * [`aarch64`] /// -/// [x86]: https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/arch/x86/index.html -/// [x86_64]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/x86_64/index.html -/// [arm]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/arm/index.html -/// [aarch64]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/aarch64/index.html +/// [`x86`]: https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/arch/x86/index.html +/// [`x86_64`]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/x86_64/index.html +/// [`arm`]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/arm/index.html +/// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/aarch64/index.html /// /// # Examples /// @@ -336,7 +339,6 @@ /// } /// } /// ``` - #[unstable(feature = "stdsimd", issue = "0")] pub mod arch { #[cfg(target_arch = "x86")]
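The reflowed `stdsimd/mod.rs` documentation above describes combining compile-time gating (`#[cfg(target_arch)]`, `#[target_feature]`) with run-time detection via `is_target_feature_detected!`. Below is a minimal sketch of that dispatch pattern as a free-standing crate; it reuses the nightly feature names and macro usage that appear elsewhere in this patch (`cfg_target_feature`, `target_feature`, `stdsimd`, `#[macro_use] extern crate stdsimd`), while the `add_quickly*` helpers are hypothetical and are not part of this diff.

```rust
// Sketch: static + dynamic CPU feature detection as described in the module
// docs. `add_quickly*` are illustrative names only.
#![feature(cfg_target_feature, target_feature, stdsimd)]

#[macro_use]
extern crate stdsimd;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
    // Run-time check: only take the AVX2 path when the CPU supports it.
    if is_target_feature_detected!("avx2") {
        unsafe { add_quickly_avx2(a, b, c) }
    } else {
        add_quickly_fallback(a, b, c)
    }
}

// `#[target_feature]` enables AVX2 code generation for this one function even
// though the rest of the crate is compiled without it; it is only sound to
// call after the run-time check above, hence the `unsafe fn`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
    // The loop below may now be auto-vectorized with AVX2 instructions.
    add_quickly_fallback(a, b, c)
}

fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
    for ((a, b), c) in a.iter().zip(b).zip(c) {
        *c = *a + *b;
    }
}

fn main() {
    let (a, b) = ([1_u8; 32], [2_u8; 32]);
    let mut c = [0_u8; 32];
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    add_quickly(&a, &b, &mut c);
}
```

As the docs note, the macro and the `arch` module are platform-specific, so the statement-level `#[cfg]` keeps the example compiling on non-`x86` targets where `is_target_feature_detected!("avx2")` would be a compile-time error.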