diff --git a/ci/run.sh b/ci/run.sh index ffa36bdcec..1fa475f27f 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -5,6 +5,8 @@ set -ex # Tests are all super fast anyway, and they fault often enough on travis that # having only one thread increases debuggability to be worth it. export RUST_TEST_THREADS=1 +export RUST_BACKTRACE=1 +export RUST_TEST_NOCAPTURE=1 # FIXME(rust-lang-nursery/stdsimd#120) run-time feature detection for ARM Neon case ${TARGET} in diff --git a/coresimd/aarch64/neon.rs b/coresimd/aarch64/neon.rs index 60929f8fe9..30adbf309b 100644 --- a/coresimd/aarch64/neon.rs +++ b/coresimd/aarch64/neon.rs @@ -5,7 +5,7 @@ #[cfg(test)] use stdsimd_test::assert_instr; use coresimd::simd_llvm::simd_add; -use coresimd::v128::f64x2; +use coresimd::simd::*; /// Vector add. #[inline] diff --git a/coresimd/arm/neon.rs b/coresimd/arm/neon.rs index 4f583728f5..70f39fc391 100644 --- a/coresimd/arm/neon.rs +++ b/coresimd/arm/neon.rs @@ -4,8 +4,8 @@ use stdsimd_test::assert_instr; use coresimd::simd_llvm::simd_add; -use coresimd::v64::*; -use coresimd::v128::*; +use coresimd::simd::*; +use convert::From; /// Vector add. #[inline] @@ -140,8 +140,8 @@ pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { - let a = a.as_i16x8(); - let b = b.as_i16x8(); + let a = i16x8::from(a); + let b = i16x8::from(b); simd_add(a, b) } @@ -150,8 +150,8 @@ pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { - let a = a.as_i32x4(); - let b = b.as_i32x4(); + let a = i32x4::from(a); + let b = i32x4::from(b); simd_add(a, b) } @@ -160,8 +160,8 @@ pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { - let a = a.as_i64x2(); - let b = b.as_i64x2(); + let a = i64x2::from(a); + let b = i64x2::from(b); simd_add(a, b) } @@ -170,8 +170,8 @@ pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { - let a = a.as_u16x8(); - let b = b.as_u16x8(); + let a = u16x8::from(a); + let b = u16x8::from(b); simd_add(a, b) } @@ -180,8 +180,8 @@ pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { - let a = a.as_u32x4(); - let b = b.as_u32x4(); + let a = u32x4::from(a); + let b = u32x4::from(b); simd_add(a, b) } @@ -190,8 +190,8 @@ pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u32(a: u32x2, b: u32x2) -> u64x2 { - let a = a.as_u64x2(); - let b = b.as_u64x2(); + let a = u64x2::from(a); + let b = u64x2::from(b); simd_add(a, b) } diff --git a/coresimd/macros.rs b/coresimd/macros.rs deleted file mode 100644 index bf1bba42a9..0000000000 --- a/coresimd/macros.rs +++ /dev/null @@ -1,574 +0,0 @@ -//! Utility macros - -macro_rules! define_ty { - ($name:ident, $($elty:ident),+) => { - #[repr(simd)] - #[derive(Clone, Copy, Debug, PartialEq)] - #[allow(non_camel_case_types)] - pub struct $name($($elty),*); - } -} - -macro_rules! 
define_ty_doc { - ($name:ident, $($elty:ident),+ | $(#[$doc:meta])*) => { - $(#[$doc])* - #[repr(simd)] - #[derive(Clone, Copy, Debug, PartialEq)] - #[allow(non_camel_case_types)] - pub struct $name($($elty),*); - } -} - -macro_rules! define_impl { - ( - $name:ident, $elemty:ident, $nelems:expr, $boolname:ident, - $($elname:ident),+ - ) => { - impl $name { - #[inline(always)] - pub const fn new($($elname: $elemty),*) -> $name { - $name($($elname),*) - } - - #[inline(always)] - pub fn len() -> i32 { - $nelems - } - - #[inline(always)] - pub const fn splat(value: $elemty) -> $name { - $name($({ - #[allow(non_camel_case_types, dead_code)] - struct $elname; - value - }),*) - } - - #[inline(always)] - pub fn extract(self, idx: u32) -> $elemty { - assert!(idx < $nelems); - unsafe { self.extract_unchecked(idx) } - } - - #[inline(always)] - pub unsafe fn extract_unchecked(self, idx: u32) -> $elemty { - simd_extract(self, idx) - } - - #[inline(always)] - pub fn replace(self, idx: u32, val: $elemty) -> $name { - assert!(idx < $nelems); - unsafe { self.replace_unchecked(idx, val) } - } - - #[inline(always)] - pub unsafe fn replace_unchecked( - self, - idx: u32, - val: $elemty, - ) -> $name { - simd_insert(self, idx, val) - } - - #[inline(always)] - pub fn store(self, slice: &mut [$elemty], offset: usize) { - assert!(slice[offset..].len() >= $nelems); - unsafe { self.store_unchecked(slice, offset) } - } - - #[inline(always)] - pub unsafe fn store_unchecked( - self, - slice: &mut [$elemty], - offset: usize, - ) { - use mem::size_of; - use ptr; - - ptr::copy_nonoverlapping( - &self as *const $name as *const u8, - slice.get_unchecked_mut(offset) as *mut $elemty as *mut u8, - size_of::<$name>()); - } - - #[inline(always)] - pub fn load(slice: &[$elemty], offset: usize) -> $name { - assert!(slice[offset..].len() >= $nelems); - unsafe { $name::load_unchecked(slice, offset) } - } - - #[inline(always)] - pub unsafe fn load_unchecked( - slice: &[$elemty], - offset: usize, - ) -> $name { - use mem::size_of; - use ptr; - - let mut x = $name::splat(0 as $elemty); - ptr::copy_nonoverlapping( - slice.get_unchecked(offset) as *const $elemty as *const u8, - &mut x as *mut $name as *mut u8, - size_of::<$name>()); - x - } - - #[inline(always)] - pub fn eq(self, other: $name) -> $boolname { - unsafe { simd_eq(self, other) } - } - - #[inline(always)] - pub fn ne(self, other: $name) -> $boolname { - unsafe { simd_ne(self, other) } - } - - #[inline(always)] - pub fn lt(self, other: $name) -> $boolname { - unsafe { simd_lt(self, other) } - } - - #[inline(always)] - pub fn le(self, other: $name) -> $boolname { - unsafe { simd_le(self, other) } - } - - #[inline(always)] - pub fn gt(self, other: $name) -> $boolname { - unsafe { simd_gt(self, other) } - } - - #[inline(always)] - pub fn ge(self, other: $name) -> $boolname { - unsafe { simd_ge(self, other) } - } - } - } -} - -macro_rules! define_from { - ($to:ident, $($from:ident),+) => { - $( - impl From<$from> for $to { - #[inline(always)] - fn from(f: $from) -> $to { - unsafe { ::mem::transmute(f) } - } - } - )+ - } -} - -macro_rules! 
define_common_ops { - ($($ty:ident),+) => { - $( - impl ::ops::Add for $ty { - type Output = Self; - #[inline(always)] - fn add(self, other: Self) -> Self { - unsafe { simd_add(self, other) } - } - } - - impl ::ops::Sub for $ty { - type Output = Self; - #[inline(always)] - fn sub(self, other: Self) -> Self { - unsafe { simd_sub(self, other) } - } - } - - impl ::ops::Mul for $ty { - type Output = Self; - #[inline(always)] - fn mul(self, other: Self) -> Self { - unsafe { simd_mul(self, other) } - } - } - - impl ::ops::Div for $ty { - type Output = Self; - #[inline(always)] - fn div(self, other: Self) -> Self { - unsafe { simd_div(self, other) } - } - } - - impl ::ops::Rem for $ty { - type Output = Self; - #[inline(always)] - fn rem(self, other: Self) -> Self { - unsafe { simd_rem(self, other) } - } - } - - impl ::ops::AddAssign for $ty { - #[inline(always)] - fn add_assign(&mut self, other: Self) { - *self = *self + other; - } - } - - impl ::ops::SubAssign for $ty { - #[inline(always)] - fn sub_assign(&mut self, other: Self) { - *self = *self - other; - } - } - - impl ::ops::MulAssign for $ty { - #[inline(always)] - fn mul_assign(&mut self, other: Self) { - *self = *self * other; - } - } - - impl ::ops::DivAssign for $ty { - #[inline(always)] - fn div_assign(&mut self, other: Self) { - *self = *self / other; - } - } - - impl ::ops::RemAssign for $ty { - #[inline(always)] - fn rem_assign(&mut self, other: Self) { - *self = *self % other; - } - } - - )+ - } -} - -macro_rules! define_shifts { - ($ty:ident, $elem:ident, $($by:ident),+) => { - $( - impl ::ops::Shl<$by> for $ty { - type Output = Self; - #[inline(always)] - fn shl(self, other: $by) -> Self { - unsafe { simd_shl(self, $ty::splat(other as $elem)) } - } - } - impl ::ops::Shr<$by> for $ty { - type Output = Self; - #[inline(always)] - fn shr(self, other: $by) -> Self { - unsafe { simd_shr(self, $ty::splat(other as $elem)) } - } - } - - impl ::ops::ShlAssign<$by> for $ty { - #[inline(always)] - fn shl_assign(&mut self, other: $by) { - *self = *self << other; - } - } - impl ::ops::ShrAssign<$by> for $ty { - #[inline(always)] - fn shr_assign(&mut self, other: $by) { - *self = *self >> other; - } - } - - )+ - } -} - -macro_rules! define_float_ops { - ($($ty:ident),+) => { - $( - impl ::ops::Neg for $ty { - type Output = Self; - #[inline(always)] - fn neg(self) -> Self { - Self::splat(-1.0) * self - } - } - )+ - }; -} - -macro_rules! define_signed_integer_ops { - ($($ty:ident),+) => { - $( - impl ::ops::Neg for $ty { - type Output = Self; - #[inline(always)] - fn neg(self) -> Self { - Self::splat(-1) * self - } - } - )+ - }; -} - -macro_rules! 
define_integer_ops { - ($(($ty:ident, $elem:ident)),+) => { - $( - impl ::ops::Not for $ty { - type Output = Self; - #[inline(always)] - fn not(self) -> Self { - $ty::splat(!0) ^ self - } - } - - impl ::ops::BitAnd for $ty { - type Output = Self; - #[inline(always)] - fn bitand(self, other: Self) -> Self { - unsafe { simd_and(self, other) } - } - } - impl ::ops::BitOr for $ty { - type Output = Self; - #[inline(always)] - fn bitor(self, other: Self) -> Self { - unsafe { simd_or(self, other) } - } - } - impl ::ops::BitXor for $ty { - type Output = Self; - #[inline(always)] - fn bitxor(self, other: Self) -> Self { - unsafe { simd_xor(self, other) } - } - } - impl ::ops::BitAndAssign for $ty { - #[inline(always)] - fn bitand_assign(&mut self, other: Self) { - *self = *self & other; - } - } - impl ::ops::BitOrAssign for $ty { - #[inline(always)] - fn bitor_assign(&mut self, other: Self) { - *self = *self | other; - } - } - impl ::ops::BitXorAssign for $ty { - #[inline(always)] - fn bitxor_assign(&mut self, other: Self) { - *self = *self ^ other; - } - } - - define_shifts!( - $ty, $elem, - u8, u16, u32, u64, usize, - i8, i16, i32, i64, isize); - - impl ::fmt::LowerHex for $ty { - fn fmt(&self, f: &mut ::fmt::Formatter) - -> ::fmt::Result { - write!(f, "{}(", stringify!($ty))?; - let n = ::mem::size_of_val(self) - / ::mem::size_of::<$elem>(); - for i in 0..n { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{:#x}", self.extract(i as u32))?; - } - write!(f, ")") - } - } - )+ - } -} - -macro_rules! define_casts { - ($(($fromty:ident, $toty:ident, $cast:ident)),+) => { - $( - impl $fromty { - #[inline(always)] - pub fn $cast(self) -> ::coresimd::simd::$toty { - unsafe { simd_cast(self) } - } - } - )+ - } -} - -#[cfg(test)] -#[macro_export] -macro_rules! test_arithmetic_ { - ($tn:ident, $zero:expr, $one:expr, $two:expr, $four:expr) => { - { - let z = $tn::splat($zero); - let o = $tn::splat($one); - let t = $tn::splat($two); - let f = $tn::splat($four); - - // add - assert_eq!(z + z, z); - assert_eq!(o + z, o); - assert_eq!(t + z, t); - assert_eq!(t + t, f); - // sub - assert_eq!(z - z, z); - assert_eq!(o - z, o); - assert_eq!(t - z, t); - assert_eq!(f - t, t); - assert_eq!(f - o - o, t); - // mul - assert_eq!(z * z, z); - assert_eq!(z * o, z); - assert_eq!(z * t, z); - assert_eq!(o * t, t); - assert_eq!(t * t, f); - // div - assert_eq!(z / o, z); - assert_eq!(t / o, t); - assert_eq!(f / o, f); - assert_eq!(t / t, o); - assert_eq!(f / t, t); - // rem - assert_eq!(o % o, z); - assert_eq!(f % t, z); - - { - let mut v = z; - assert_eq!(v, z); - v += o; // add_assign - assert_eq!(v, o); - v -= o; // sub_assign - assert_eq!(v, z); - v = t; - v *= o; // mul_assign - assert_eq!(v, t); - v *= t; - assert_eq!(v, f); - v /= o; // div_assign - assert_eq!(v, f); - v /= t; - assert_eq!(v, t); - v %= t; // rem_assign - assert_eq!(v, z); - } - } - }; - } - -#[cfg(test)] -#[macro_export] -macro_rules! test_neg_ { - ($tn:ident, $zero:expr, $one:expr, $two:expr, $four:expr) => { - { - let z = $tn::splat($zero); - let o = $tn::splat($one); - let t = $tn::splat($two); - let f = $tn::splat($four); - - let nz = $tn::splat(-$zero); - let no = $tn::splat(-$one); - let nt = $tn::splat(-$two); - let nf = $tn::splat(-$four); - - assert_eq!(-z, nz); - assert_eq!(-o, no); - assert_eq!(-t, nt); - assert_eq!(-f, nf); - } - }; - } - -#[cfg(test)] -#[macro_export] -macro_rules! 
test_bit_arithmetic_ { - ($tn:ident) => { - { - let z = $tn::splat(0); - let o = $tn::splat(1); - let t = $tn::splat(2); - let f = $tn::splat(4); - let m = $tn::splat(!z.extract(0)); - - // shr - assert_eq!(o >> 1, z); - assert_eq!(t >> 1, o); - assert_eq!(f >> 1, t); - // shl - assert_eq!(o << 1, t); - assert_eq!(o << 2, f); - assert_eq!(t << 1, f); - // bitand - assert_eq!(o & o, o); - assert_eq!(t & t, t); - assert_eq!(t & o, z); - // bitor - assert_eq!(o | o, o); - assert_eq!(t | t, t); - assert_eq!(z | o, o); - // bitxor - assert_eq!(o ^ o, z); - assert_eq!(t ^ t, z); - assert_eq!(z ^ o, o); - // not - assert_eq!(!z, m); - assert_eq!(!m, z); - - { // shr_assign - let mut v = o; - v >>= 1; - assert_eq!(v, z); - } - { // shl_assign - let mut v = o; - v <<= 1; - assert_eq!(v, t); - } - { // and_assign - let mut v = o; - v &= t; - assert_eq!(v, z); - } - { // or_assign - let mut v = z; - v |= o; - assert_eq!(v, o); - } - { // xor_assign - let mut v = z; - v ^= o; - assert_eq!(v, o); - } - } - }; -} - -#[cfg(test)] -#[macro_export] -macro_rules! test_ops_si { - ($($tn:ident),+) => { - $( - test_arithmetic_!($tn, 0, 1, 2, 4); - test_neg_!($tn, 0, 1, 2, 4); - test_bit_arithmetic_!($tn); - )+ - }; - } - -#[cfg(test)] -#[macro_export] -macro_rules! test_ops_ui { - ($($tn:ident),+) => { - $( - test_arithmetic_!($tn, 0, 1, 2, 4); - test_bit_arithmetic_!($tn); - )+ - }; - } - -#[cfg(test)] -#[macro_export] -macro_rules! test_ops_f { - ($($tn:ident),+) => { - $( - test_arithmetic_!($tn, 0., 1., 2., 4.); - test_neg_!($tn, 0., 1., 2., 4.); - )+ - }; - } diff --git a/coresimd/mod.rs b/coresimd/mod.rs index 2204bce4c9..38cbe92cf9 100644 --- a/coresimd/mod.rs +++ b/coresimd/mod.rs @@ -1,14 +1,16 @@ +//! `coresimd` + +#[macro_use] +mod ppsv; + /// Platform independent SIMD vector types and operations. /// -/// This is an **unstable** module for portable SIMD operations. This module has -/// not yet gone through an RFC and is likely to change, but feedback is always -/// welcome! +/// This is an **unstable** module for portable SIMD operations. This module +/// has not yet gone through an RFC and is likely to change, but feedback is +/// always welcome! #[unstable(feature = "stdsimd", issue = "0")] pub mod simd { - pub use coresimd::v128::*; - pub use coresimd::v256::*; - pub use coresimd::v512::*; - pub use coresimd::v64::*; + pub use coresimd::ppsv::*; } /// Platform dependent vendor intrinsics. @@ -23,15 +25,15 @@ pub mod simd { /// only one platform it actually contains intrinsics for multiple platforms /// compiled in conditionally. 
For other platforms of stdsimd see: /// -/// * [x86] -/// * [x86_64] -/// * [arm] -/// * [aarch64] +/// * [`x86`] +/// * [`x86_64`] +/// * [`arm`] +/// * [`aarch64`] /// -/// [x86]: https://rust-lang-nursery.github.io/stdsimd/x86/stdsimd/arch/index.html -/// [x86_64]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/index.html -/// [arm]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/index.html -/// [aarch64]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/index.html +/// [`x86`]: https://rust-lang-nursery.github.io/stdsimd/x86/stdsimd/arch/index.html +/// [`x86_64`]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/index.html +/// [`arm`]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/index.html +/// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/index.html #[unstable(feature = "stdsimd", issue = "0")] pub mod arch { /// Platform-specific intrinsics for the `x86` platform. @@ -69,51 +71,7 @@ pub mod arch { } } -#[macro_use] -mod macros; mod simd_llvm; -mod v128; -mod v256; -mod v512; -mod v64; - -/// 32-bit wide vector tpyes -mod v32 { - #[cfg(not(test))] - use prelude::v1::*; - use coresimd::simd_llvm::*; - - define_ty! { i16x2, i16, i16 } - define_impl! { i16x2, i16, 2, i16x2, x0, x1 } - define_ty! { u16x2, u16, u16 } - define_impl! { u16x2, u16, 2, i16x2, x0, x1 } - - define_ty! { i8x4, i8, i8, i8, i8 } - define_impl! { i8x4, i8, 4, i8x4, x0, x1, x2, x3 } - define_ty! { u8x4, u8, u8, u8, u8 } - define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 } - - define_casts!( - (i16x2, i64x2, as_i64x2), - (u16x2, i64x2, as_i64x2), - (i8x4, i32x4, as_i32x4), - (u8x4, i32x4, as_i32x4) - ); -} - -/// 16-bit wide vector tpyes -mod v16 { - #[cfg(not(test))] - use prelude::v1::*; - use coresimd::simd_llvm::*; - - define_ty! { i8x2, i8, i8 } - define_impl! { i8x2, i8, 2, i8x2, x0, x1 } - define_ty! { u8x2, u8, u8 } - define_impl! { u8x2, u8, 2, i8x2, x0, x1 } - - define_casts!((i8x2, i64x2, as_i64x2), (u8x2, i64x2, as_i64x2)); -} #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod x86; diff --git a/coresimd/ppsv/api/arithmetic_ops.rs b/coresimd/ppsv/api/arithmetic_ops.rs new file mode 100644 index 0000000000..e98745014b --- /dev/null +++ b/coresimd/ppsv/api/arithmetic_ops.rs @@ -0,0 +1,142 @@ +//! Lane-wise arithmetic operations. + +macro_rules! 
impl_arithmetic_ops { + ($id:ident) => { + impl ops::Add for $id { + type Output = Self; + #[inline] + fn add(self, other: Self) -> Self { + unsafe { simd_add(self, other) } + } + } + + impl ops::Sub for $id { + type Output = Self; + #[inline] + fn sub(self, other: Self) -> Self { + unsafe { simd_sub(self, other) } + } + } + + impl ops::Mul for $id { + type Output = Self; + #[inline] + fn mul(self, other: Self) -> Self { + unsafe { simd_mul(self, other) } + } + } + + impl ops::Div for $id { + type Output = Self; + #[inline] + fn div(self, other: Self) -> Self { + unsafe { simd_div(self, other) } + } + } + + impl ops::Rem for $id { + type Output = Self; + #[inline] + fn rem(self, other: Self) -> Self { + unsafe { simd_rem(self, other) } + } + } + + impl ops::AddAssign for $id { + #[inline] + fn add_assign(&mut self, other: Self) { + *self = *self + other; + } + } + + impl ops::SubAssign for $id { + #[inline] + fn sub_assign(&mut self, other: Self) { + *self = *self - other; + } + } + + impl ops::MulAssign for $id { + #[inline] + fn mul_assign(&mut self, other: Self) { + *self = *self * other; + } + } + + impl ops::DivAssign for $id { + #[inline] + fn div_assign(&mut self, other: Self) { + *self = *self / other; + } + } + + impl ops::RemAssign for $id { + #[inline] + fn rem_assign(&mut self, other: Self) { + *self = *self % other; + } + } + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_arithmetic_ops { + ($id:ident, $elem_ty:ident) => { + #[test] + fn arithmetic() { + use ::coresimd::simd::$id; + let z = $id::splat(0 as $elem_ty); + let o = $id::splat(1 as $elem_ty); + let t = $id::splat(2 as $elem_ty); + let f = $id::splat(4 as $elem_ty); + + // add + assert_eq!(z + z, z); + assert_eq!(o + z, o); + assert_eq!(t + z, t); + assert_eq!(t + t, f); + // sub + assert_eq!(z - z, z); + assert_eq!(o - z, o); + assert_eq!(t - z, t); + assert_eq!(f - t, t); + assert_eq!(f - o - o, t); + // mul + assert_eq!(z * z, z); + assert_eq!(z * o, z); + assert_eq!(z * t, z); + assert_eq!(o * t, t); + assert_eq!(t * t, f); + // div + assert_eq!(z / o, z); + assert_eq!(t / o, t); + assert_eq!(f / o, f); + assert_eq!(t / t, o); + assert_eq!(f / t, t); + // rem + assert_eq!(o % o, z); + assert_eq!(f % t, z); + + { + let mut v = z; + assert_eq!(v, z); + v += o; // add_assign + assert_eq!(v, o); + v -= o; // sub_assign + assert_eq!(v, z); + v = t; + v *= o; // mul_assign + assert_eq!(v, t); + v *= t; + assert_eq!(v, f); + v /= o; // div_assign + assert_eq!(v, f); + v /= t; + assert_eq!(v, t); + v %= t; // rem_assign + assert_eq!(v, z); + } + } + }; +} diff --git a/coresimd/ppsv/api/arithmetic_reductions.rs b/coresimd/ppsv/api/arithmetic_reductions.rs new file mode 100644 index 0000000000..54fba4871d --- /dev/null +++ b/coresimd/ppsv/api/arithmetic_reductions.rs @@ -0,0 +1,64 @@ +//! Implements portable arithmetic vector reductions. + +macro_rules! impl_arithmetic_reductions { + ($id:ident, $elem_ty:ident) => { + impl $id { + /// Lane-wise addition of the vector elements. + #[inline] + pub fn sum(self) -> $elem_ty { + ReduceAdd::reduce_add(self) + } + /// Lane-wise multiplication of the vector elements. + #[inline] + pub fn product(self) -> $elem_ty { + ReduceMul::reduce_mul(self) + } + } + } +} + +#[cfg(test)] +macro_rules! 
test_arithmetic_reductions { + ($id:ident, $elem_ty:ident) => { + + fn alternating(x: usize) -> ::coresimd::simd::$id { + use ::coresimd::simd::$id; + let mut v = $id::splat(1 as $elem_ty); + for i in 0..$id::lanes() { + if i % x == 0 { + v = v.replace(i, 2 as $elem_ty); + } + } + v + } + + #[test] + fn sum() { + use ::coresimd::simd::$id; + let v = $id::splat(0 as $elem_ty); + assert_eq!(v.sum(), 0 as $elem_ty); + let v = $id::splat(1 as $elem_ty); + assert_eq!(v.sum(), $id::lanes() as $elem_ty); + let v = alternating(2); + eprintln!("{:?}", v); + assert_eq!(v.sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty); + } + #[test] + fn product() { + use ::coresimd::simd::$id; + let v = $id::splat(0 as $elem_ty); + assert_eq!(v.product(), 0 as $elem_ty); + let v = $id::splat(1 as $elem_ty); + assert_eq!(v.product(), 1 as $elem_ty); + let f = match $id::lanes() { + 64 => 16, + 32 => 8, + 16 => 4, + _ => 2, + }; + let v = alternating(f); + eprintln!("{:?}", v); + assert_eq!(v.product(), (2_usize.pow(($id::lanes() / f) as u32) as $elem_ty)); + } + } +} diff --git a/coresimd/ppsv/api/bitwise_ops.rs b/coresimd/ppsv/api/bitwise_ops.rs new file mode 100644 index 0000000000..aa82b2e797 --- /dev/null +++ b/coresimd/ppsv/api/bitwise_ops.rs @@ -0,0 +1,171 @@ +//! Lane-wise bitwise operations for integer and boolean vectors. + +macro_rules! impl_bitwise_ops { + ($ty:ident, $true_val:expr) => { + impl ops::Not for $ty { + type Output = Self; + #[inline] + fn not(self) -> Self { + Self::splat($true_val) ^ self + } + } + impl ops::BitXor for $ty { + type Output = Self; + #[inline] + fn bitxor(self, other: Self) -> Self { + unsafe { simd_xor(self, other) } + } + } + impl ops::BitAnd for $ty { + type Output = Self; + #[inline] + fn bitand(self, other: Self) -> Self { + unsafe { simd_and(self, other) } + } + } + impl ops::BitOr for $ty { + type Output = Self; + #[inline] + fn bitor(self, other: Self) -> Self { + unsafe { simd_or(self, other) } + } + } + impl ops::BitAndAssign for $ty { + #[inline] + fn bitand_assign(&mut self, other: Self) { + *self = *self & other; + } + } + impl ops::BitOrAssign for $ty { + #[inline] + fn bitor_assign(&mut self, other: Self) { + *self = *self | other; + } + } + impl ops::BitXorAssign for $ty { + #[inline] + fn bitxor_assign(&mut self, other: Self) { + *self = *self ^ other; + } + } + }; +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_int_bitwise_ops { + ($id:ident, $elem_ty:ident) => { + #[test] + fn bitwise_ops() { + use ::coresimd::simd::$id; + let z = $id::splat(0 as $elem_ty); + let o = $id::splat(1 as $elem_ty); + let t = $id::splat(2 as $elem_ty); + let m = $id::splat(!z.extract(0)); + + // Not: + assert_eq!(!z, m); + assert_eq!(!m, z); + + // BitAnd: + assert_eq!(o & o, o); + assert_eq!(o & z, z); + assert_eq!(z & o, z); + assert_eq!(z & z, z); + + assert_eq!(t & t, t); + assert_eq!(t & o, z); + assert_eq!(o & t, z); + + // BitOr: + assert_eq!(o | o, o); + assert_eq!(o | z, o); + assert_eq!(z | o, o); + assert_eq!(z | z, z); + + assert_eq!(t | t, t); + assert_eq!(z | t, t); + assert_eq!(t | z, t); + + // BitXOR: + assert_eq!(o ^ o, z); + assert_eq!(z ^ z, z); + assert_eq!(z ^ o, o); + assert_eq!(o ^ z, o); + + assert_eq!(t ^ t, z); + assert_eq!(t ^ z, t); + assert_eq!(z ^ t, t); + + { // AndAssign: + let mut v = o; + v &= t; + assert_eq!(v, z); + } + { // OrAssign: + let mut v = z; + v |= o; + assert_eq!(v, o); + } + { // XORAssign: + let mut v = z; + v ^= o; + assert_eq!(v, o); + } + } + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! 
test_bool_bitwise_ops { + ($id:ident) => { + #[test] + fn bool_arithmetic() { + use ::coresimd::simd::*; + + let t = $id::splat(true); + let f = $id::splat(false); + assert!(t != f); + assert!(!(t == f)); + + // Not: + assert_eq!(!t, f); + assert_eq!(t, !f); + + // BitAnd: + assert_eq!(t & f, f); + assert_eq!(f & t, f); + assert_eq!(t & t, t); + assert_eq!(f & f, f); + + // BitOr: + assert_eq!(t | f, t); + assert_eq!(f | t, t); + assert_eq!(t | t, t); + assert_eq!(f | f, f); + + // BitXOR: + assert_eq!(t ^ f, t); + assert_eq!(f ^ t, t); + assert_eq!(t ^ t, f); + assert_eq!(f ^ f, f); + + { // AndAssign: + let mut v = f; + v &= t; + assert_eq!(v, f); + } + { // OrAssign: + let mut v = f; + v |= t; + assert_eq!(v, t); + } + { // XORAssign: + let mut v = f; + v ^= t; + assert_eq!(v, t); + } + } + } +} diff --git a/coresimd/ppsv/api/bitwise_reductions.rs b/coresimd/ppsv/api/bitwise_reductions.rs new file mode 100644 index 0000000000..ae598e5b98 --- /dev/null +++ b/coresimd/ppsv/api/bitwise_reductions.rs @@ -0,0 +1,97 @@ +//! Implements portable bitwise vector reductions. + +macro_rules! impl_bitwise_reductions { + ($id:ident, $elem_ty:ident) => { + impl $id { + /// Lane-wise bitwise `and` of the vector elements. + #[inline] + pub fn and(self) -> $elem_ty { + ReduceAnd::reduce_and(self) + } + /// Lane-wise bitwise `or` of the vector elements. + #[inline] + pub fn or(self) -> $elem_ty { + ReduceOr::reduce_or(self) + } + /// Lane-wise bitwise `xor` of the vector elements. + #[inline] + pub fn xor(self) -> $elem_ty { + ReduceXor::reduce_xor(self) + } + } + } +} + +macro_rules! impl_bool_bitwise_reductions { + ($id:ident, $elem_ty:ident) => { + impl $id { + /// Lane-wise bitwise `and` of the vector elements. + #[inline] + pub fn and(self) -> $elem_ty { + ReduceAnd::reduce_and(self) !=0 + } + /// Lane-wise bitwise `or` of the vector elements. + #[inline] + pub fn or(self) -> $elem_ty { + ReduceOr::reduce_or(self) != 0 + } + /// Lane-wise bitwise `xor` of the vector elements. + #[inline] + pub fn xor(self) -> $elem_ty { + ReduceXor::reduce_xor(self) != 0 + } + } + } +} + + +#[cfg(test)] +macro_rules! test_bitwise_reductions { + ($id:ident, $true:expr) => { + #[test] + fn and() { + let false_ = !$true; + use ::coresimd::simd::$id; + let v = $id::splat(false_); + assert_eq!(v.and(), false_); + let v = $id::splat($true); + assert_eq!(v.and(), $true); + let v = $id::splat(false_); + let v = v.replace(0, $true); + assert_eq!(v.and(), false_); + let v = $id::splat($true); + let v = v.replace(0, false_); + assert_eq!(v.and(), false_); + } + #[test] + fn or() { + let false_ = !$true; + use ::coresimd::simd::$id; + let v = $id::splat(false_); + assert_eq!(v.or(), false_); + let v = $id::splat($true); + assert_eq!(v.or(), $true); + let v = $id::splat(false_); + let v = v.replace(0, $true); + assert_eq!(v.or(), $true); + let v = $id::splat($true); + let v = v.replace(0, false_); + assert_eq!(v.or(), $true); + } + #[test] + fn xor() { + let false_ = !$true; + use ::coresimd::simd::$id; + let v = $id::splat(false_); + assert_eq!(v.xor(), false_); + let v = $id::splat($true); + assert_eq!(v.xor(), false_); + let v = $id::splat(false_); + let v = v.replace(0, $true); + assert_eq!(v.xor(), $true); + let v = $id::splat($true); + let v = v.replace(0, false_); + assert_eq!(v.xor(), $true); + } + } +} diff --git a/coresimd/ppsv/api/bool_vectors.rs b/coresimd/ppsv/api/bool_vectors.rs new file mode 100644 index 0000000000..01fa13f0b8 --- /dev/null +++ b/coresimd/ppsv/api/bool_vectors.rs @@ -0,0 +1,147 @@ +//! 
Minimal boolean vector implementation + +/// Minimal interface: all packed SIMD boolean vector types implement this. +macro_rules! impl_bool_minimal { + ($id:ident, $elem_ty:ident, $elem_count:expr, $($elem_name:ident),+) => { + + #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] + impl Clone for $id { + #[inline] // currently needed for correctness + fn clone(&self) -> Self { + *self + } + } + + impl $id { + /// Creates a new instance with each vector elements initialized + /// with the provided values. + #[inline] + pub const fn new($($elem_name: bool),*) -> Self { + $id($(Self::bool_to_internal($elem_name)),*) + } + + /// Converts a boolean type into the type of the vector lanes. + #[inline] + const fn bool_to_internal(x: bool) -> $elem_ty { + [0 as $elem_ty, !(0 as $elem_ty)][x as usize] + } + + /// Returns the number of vector lanes. + #[inline] + pub const fn lanes() -> usize { + $elem_count + } + + /// Constructs a new instance with each element initialized to + /// `value`. + #[inline] + pub const fn splat(value: bool) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + Self::bool_to_internal(value) + }),*) + } + + /// Extracts the value at `index`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + pub fn extract(self, index: usize) -> bool { + assert!(index < $elem_count); + unsafe { self.extract_unchecked(index) } + } + + /// Extracts the value at `index`. + /// + /// If `index >= Self::lanes()` the behavior is undefined. + #[inline] + pub unsafe fn extract_unchecked(self, index: usize) -> bool { + let x: $elem_ty = simd_extract(self, index as u32); + x != 0 + } + + /// Returns a new vector where the value at `index` is replaced by `new_value`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + #[must_use = "replace does not modify the original value - it returns a new vector with the value at `index` replaced by `new_value`d"] + pub fn replace(self, index: usize, new_value: bool) -> Self { + assert!(index < $elem_count); + unsafe { self.replace_unchecked(index, new_value) } + } + + /// Returns a new vector where the value at `index` is replaced by `new_value`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + #[must_use = "replace_unchecked does not modify the original value - it returns a new vector with the value at `index` replaced by `new_value`d"] + pub unsafe fn replace_unchecked( + self, + index: usize, + new_value: bool, + ) -> Self { + simd_insert(self, index as u32, Self::bool_to_internal(new_value)) + } + } + } +} + +#[cfg(test)] +macro_rules! 
test_bool_minimal { + ($id:ident, $elem_count:expr) => { + #[test] + fn minimal() { + use ::coresimd::simd::$id; + // TODO: test new + + // lanes: + assert_eq!($elem_count, $id::lanes()); + + // splat and extract / extract_unchecked: + let vec = $id::splat(true); + for i in 0..$id::lanes() { + assert_eq!(true, vec.extract(i)); + assert_eq!(true, unsafe { vec.extract_unchecked(i) }); + } + + // replace / replace_unchecked + let new_vec = vec.replace(1, false); + for i in 0..$id::lanes() { + if i == 1 { + assert_eq!(false, new_vec.extract(i)); + } else { + assert_eq!(true, new_vec.extract(i)); + } + } + let new_vec = unsafe { vec.replace_unchecked(1, false) }; + for i in 0..$id::lanes() { + if i == 1 { + assert_eq!(false, new_vec.extract(i)); + } else { + assert_eq!(true, new_vec.extract(i)); + } + } + } + #[test] + #[should_panic] + fn minimal_extract_panic_on_out_of_bounds() { + use ::coresimd::simd::$id; + let vec = $id::splat(false); + let _ = vec.extract($id::lanes()); + } + #[test] + #[should_panic] + fn minimal_replace_panic_on_out_of_bounds() { + use ::coresimd::simd::$id; + let vec = $id::splat(false); + let _ = vec.replace($id::lanes(), true); + } + } +} diff --git a/coresimd/ppsv/api/boolean_reductions.rs b/coresimd/ppsv/api/boolean_reductions.rs new file mode 100644 index 0000000000..3c45fee48d --- /dev/null +++ b/coresimd/ppsv/api/boolean_reductions.rs @@ -0,0 +1,83 @@ +//! Lane-wise boolean vector reductions. + +macro_rules! impl_bool_reductions { + ($id:ident) => { + impl $id { + /// Are `all` vector lanes `true`? + #[inline] + pub fn all(self) -> bool { + self.and() + } + /// Is `any` vector lanes `true`? + #[inline] + pub fn any(self) -> bool { + self.or() + } + /// Are `all` vector lanes `false`? + #[inline] + pub fn none(self) -> bool { + !self.or() + } + } + } +} + +#[cfg(test)] +macro_rules! test_bool_reductions { + ($id:ident) => { + #[test] + fn all() { + use ::coresimd::simd::$id; + + let a = $id::splat(true); + assert!(a.all()); + let a = $id::splat(false); + assert!(!a.all()); + + for i in 0..$id::lanes() { + let mut a = $id::splat(true); + a = a.replace(i, false); + assert!(!a.all()); + let mut a = $id::splat(false); + a = a.replace(i, true); + assert!(!a.all()); + } + } + #[test] + fn any() { + use ::coresimd::simd::$id; + + let a = $id::splat(true); + assert!(a.any()); + let a = $id::splat(false); + assert!(!a.any()); + + for i in 0..$id::lanes() { + let mut a = $id::splat(true); + a = a.replace(i, false); + assert!(a.any()); + let mut a = $id::splat(false); + a = a.replace(i, true); + assert!(a.any()); + } + } + #[test] + fn none() { + use ::coresimd::simd::$id; + + let a = $id::splat(true); + assert!(!a.none()); + let a = $id::splat(false); + assert!(a.none()); + + for i in 0..$id::lanes() { + let mut a = $id::splat(true); + a = a.replace(i, false); + assert!(!a.none()); + let mut a = $id::splat(false); + a = a.replace(i, true); + assert!(!a.none()); + } + } + } +} diff --git a/coresimd/ppsv/api/cmp.rs b/coresimd/ppsv/api/cmp.rs new file mode 100644 index 0000000000..5eb15d2933 --- /dev/null +++ b/coresimd/ppsv/api/cmp.rs @@ -0,0 +1,138 @@ +//! Lane-wise vector comparisons returning boolean vectors. + +macro_rules! impl_cmp { + ($id:ident, $bool_ty:ident) => { + impl $id { + /// Lane-wise equality comparison. + #[inline] + pub fn eq(self, other: $id) -> $bool_ty { + unsafe { simd_eq(self, other) } + } + + /// Lane-wise inequality comparison. 
+ #[inline] + pub fn ne(self, other: $id) -> $bool_ty { + unsafe { simd_ne(self, other) } + } + + /// Lane-wise less-than comparison. + #[inline] + pub fn lt(self, other: $id) -> $bool_ty { + unsafe { simd_lt(self, other) } + } + + /// Lane-wise less-than-or-equals comparison. + #[inline] + pub fn le(self, other: $id) -> $bool_ty { + unsafe { simd_le(self, other) } + } + + /// Lane-wise greater-than comparison. + #[inline] + pub fn gt(self, other: $id) -> $bool_ty { + unsafe { simd_gt(self, other) } + } + + /// Lane-wise greater-than-or-equals comparison. + #[inline] + pub fn ge(self, other: $id) -> $bool_ty { + unsafe { simd_ge(self, other) } + } + } + } +} + +macro_rules! impl_bool_cmp { + ($id:ident, $bool_ty:ident) => { + impl $id { + /// Lane-wise equality comparison. + #[inline] + pub fn eq(self, other: $id) -> $bool_ty { + unsafe { simd_eq(self, other) } + } + + /// Lane-wise inequality comparison. + #[inline] + pub fn ne(self, other: $id) -> $bool_ty { + unsafe { simd_ne(self, other) } + } + + /// Lane-wise less-than comparison. + #[inline] + pub fn lt(self, other: $id) -> $bool_ty { + unsafe { simd_gt(self, other) } + } + + /// Lane-wise less-than-or-equals comparison. + #[inline] + pub fn le(self, other: $id) -> $bool_ty { + unsafe { simd_ge(self, other) } + } + + /// Lane-wise greater-than comparison. + #[inline] + pub fn gt(self, other: $id) -> $bool_ty { + unsafe { simd_lt(self, other) } + } + + /// Lane-wise greater-than-or-equals comparison. + #[inline] + pub fn ge(self, other: $id) -> $bool_ty { + unsafe { simd_le(self, other) } + } + } + } +} + + +#[cfg(test)] +#[macro_export] +macro_rules! test_cmp { + ($id:ident, $elem_ty:ident, $bool_ty:ident, + $true:expr, $false:expr) => { + #[test] + fn cmp() { + use ::coresimd::simd::*; + + let a = $id::splat($false); + let b = $id::splat($true); + + let r = a.lt(b); + let e = $bool_ty::splat(true); + eprintln!("0| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + let r = a.le(b); + eprintln!("1| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + + let e = $bool_ty::splat(false); + let r = a.gt(b); + eprintln!("2| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + let r = a.ge(b); + eprintln!("3| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + let r = a.eq(b); + eprintln!("4| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + + let mut a = a; + let mut b = b; + let mut e = e; + for i in 0..$id::lanes() { + if i % 2 == 0 { + a = a.replace(i, $false); + b = b.replace(i, $true); + e = e.replace(i, true); + } else { + a = a.replace(i, $true); + b = b.replace(i, $false); + e = e.replace(i, false); + } + } + let r = a.lt(b); + eprintln!("5| a: {:?}, b: {:?}, r: {:?}, e: {:?}", a, b, r, e); + assert!(r == e); + } + } +} diff --git a/coresimd/ppsv/api/default.rs b/coresimd/ppsv/api/default.rs new file mode 100644 index 0000000000..153bbe1ae3 --- /dev/null +++ b/coresimd/ppsv/api/default.rs @@ -0,0 +1,28 @@ +//! Implements `Default` for vector types. + +macro_rules! impl_default { + ($id:ident, $elem_ty:ident) => { + impl Default for $id { + #[inline] + fn default() -> Self { + Self::splat($elem_ty::default()) + } + } + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! 
test_default {
+    ($id:ident, $elem_ty:ident) => {
+        #[test]
+        fn default() {
+            use ::coresimd::simd::*;
+            use std::default::Default;
+            let a = $id::default();
+            for i in 0..$id::lanes() {
+                assert_eq!(a.extract(i), $elem_ty::default());
+            }
+        }
+    }
+}
diff --git a/coresimd/ppsv/api/eq.rs b/coresimd/ppsv/api/eq.rs
new file mode 100644
index 0000000000..bcbee31041
--- /dev/null
+++ b/coresimd/ppsv/api/eq.rs
@@ -0,0 +1,5 @@
+//! Implements `Eq` for vector types.
+
+macro_rules! impl_eq {
+    ($id:ident) => { impl Eq for $id {} }
+}
diff --git a/coresimd/ppsv/api/fmt.rs b/coresimd/ppsv/api/fmt.rs
new file mode 100644
index 0000000000..159a049bae
--- /dev/null
+++ b/coresimd/ppsv/api/fmt.rs
@@ -0,0 +1,53 @@
+//! Implements formatting traits.
+
+macro_rules! impl_hex_fmt {
+    ($id:ident, $elem_ty:ident) => {
+        impl fmt::LowerHex for $id {
+            fn fmt(&self, f: &mut fmt::Formatter)
+                   -> fmt::Result {
+                write!(f, "{}(", stringify!($id))?;
+                let n = mem::size_of_val(self)
+                    / mem::size_of::<$elem_ty>();
+                for i in 0..n {
+                    if i > 0 {
+                        write!(f, ", ")?;
+                    }
+                    self.extract(i).fmt(f)?;
+                }
+                write!(f, ")")
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+macro_rules! test_hex_fmt_impl {
+    ($id:ident, $elem_ty:ident, $($values:expr),+) => {
+        #[test]
+        fn hex_fmt() {
+            use ::std::prelude::v1::*;
+            use ::coresimd::simd::$id;
+            for &i in [$($values),+].iter() {
+                let vec = $id::splat(i as $elem_ty);
+
+                let s = format!("{:#x}", vec);
+                let beg = format!("{}(", stringify!($id));
+                assert!(s.starts_with(&beg));
+                assert!(s.ends_with(")"));
+                let s: Vec<String> = s.replace(&beg, "").replace(")", "").split(",")
+                    .map(|v| v.trim().to_string()).collect();
+                assert_eq!(s.len(), $id::lanes());
+                for (index, ss) in s.into_iter().enumerate() {
+                    assert_eq!(ss, format!("{:#x}", vec.extract(index)));
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+macro_rules! test_hex_fmt {
+    ($id:ident, $elem_ty:ident) => {
+        test_hex_fmt_impl!($id, $elem_ty, 0 as $elem_ty, !(0 as $elem_ty), (1 as $elem_ty));
+    }
+}
diff --git a/coresimd/ppsv/api/from.rs b/coresimd/ppsv/api/from.rs
new file mode 100644
index 0000000000..f1008c1ea6
--- /dev/null
+++ b/coresimd/ppsv/api/from.rs
@@ -0,0 +1,32 @@
+//! Implements the From trait for vector types, which performs a lane-wise
+//! cast between vector types with the same number of lanes.
+
+macro_rules! impl_from {
+    ($to:ident: $elem_ty:ident, $test_mod:ident | $($from:ident),+) => {
+        $(
+            impl From<::simd::$from> for $to {
+                #[inline]
+                fn from(f: ::simd::$from) -> $to {
+                    unsafe { simd_cast(f) }
+                }
+            }
+        )+
+
+        #[cfg(test)]
+        mod $test_mod {
+            $(
+                #[test]
+                fn $from() {
+                    use ::std::convert::{From, Into};
+                    use ::coresimd::simd::{$from, $to};
+                    use ::std::default::Default;
+                    assert_eq!($to::lanes(), $from::lanes());
+                    let a: $from = $from::default();
+                    let b_0: $to = From::from(a);
+                    let b_1: $to = a.into();
+                    assert_eq!(b_0, b_1);
+                }
+            )+
+        }
+    }
+}
diff --git a/coresimd/ppsv/api/from_bits.rs b/coresimd/ppsv/api/from_bits.rs
new file mode 100644
index 0000000000..43f82696dc
--- /dev/null
+++ b/coresimd/ppsv/api/from_bits.rs
@@ -0,0 +1,39 @@
+//! Implements the `FromBits` trait for vector types, which performs bitwise
+//! lossless transmutes between equally-sized vector types.
+
+macro_rules! impl_from_bits_ {
+    ($to:ident: $($from:ident),+) => {
+        $(
+            impl ::simd::FromBits<$from> for $to {
+                #[inline]
+                fn from_bits(f: $from) -> $to {
+                    unsafe { mem::transmute(f) }
+                }
+            }
+        )+
+    }
+}
+
+macro_rules!
impl_from_bits {
+    ($to:ident: $elem_ty:ident, $test_mod:ident | $($from:ident),+) => {
+        impl_from_bits_!($to: $($from),+);
+
+        #[cfg(test)]
+        mod $test_mod {
+            $(
+                #[test]
+                fn $from() {
+                    use ::coresimd::simd::{$from, $to, FromBits, IntoBits};
+                    use ::std::{mem, default};
+                    use default::Default;
+                    assert_eq!(mem::size_of::<$from>(), mem::size_of::<$to>());
+                    let a: $from = $from::default();
+                    let b_0: $to = FromBits::from_bits(a);
+                    let b_1: $to = a.into_bits();
+                    assert_eq!(b_0, b_1);
+                }
+            )+
+        }
+    }
+}
diff --git a/coresimd/ppsv/api/hash.rs b/coresimd/ppsv/api/hash.rs
new file mode 100644
index 0000000000..47c135b4e1
--- /dev/null
+++ b/coresimd/ppsv/api/hash.rs
@@ -0,0 +1,43 @@
+//! Implements `Hash`.
+
+macro_rules! impl_hash {
+    ($id:ident, $elem_ty:ident) => {
+        impl hash::Hash for $id {
+            #[inline]
+            fn hash<H: hash::Hasher>(&self, state: &mut H) {
+                union A {
+                    data: [$elem_ty; $id::lanes()],
+                    vec: $id
+                }
+                unsafe {
+                    A { vec: *self }.data.hash(state)
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+#[macro_export]
+macro_rules! test_hash {
+    ($id:ident, $elem_ty:ident) => {
+        #[test]
+        fn hash() {
+            use ::coresimd::simd::$id;
+            use ::std::collections::hash_map::DefaultHasher;
+            use ::std::hash::{Hash, Hasher};
+            use ::std::{mem, clone};
+            use clone::Clone;
+            type A = [$elem_ty; $id::lanes()];
+            let a: A = [42 as $elem_ty; $id::lanes()];
+            assert!(mem::size_of::<A>() == mem::size_of::<$id>());
+            let mut a_hash = DefaultHasher::new();
+            let mut v_hash = a_hash.clone();
+            a.hash(&mut a_hash);
+
+            let v = $id::splat(42 as $elem_ty);
+            v.hash(&mut v_hash);
+            assert_eq!(a_hash.finish(), v_hash.finish());
+        }
+    }
+}
diff --git a/coresimd/ppsv/api/load_store.rs b/coresimd/ppsv/api/load_store.rs
new file mode 100644
index 0000000000..fe21f74c98
--- /dev/null
+++ b/coresimd/ppsv/api/load_store.rs
@@ -0,0 +1,272 @@
+//! Implements the load/store API.
+
+macro_rules! impl_load_store {
+    ($id:ident, $elem_ty:ident, $elem_count:expr) => {
+        impl $id {
+            /// Writes the values of the vector to the `slice`.
+            ///
+            /// # Panics
+            ///
+            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not
+            /// aligned to an `align_of::<Self>()` boundary.
+            #[inline]
+            pub fn store_aligned(self, slice: &mut [$elem_ty]) {
+                unsafe {
+                    assert!(slice.len() >= $elem_count);
+                    let target_ptr = slice.get_unchecked_mut(0) as *mut $elem_ty;
+                    assert!(target_ptr.align_offset(mem::align_of::<Self>()) == 0);
+                    self.store_aligned_unchecked(slice);
+                }
+            }
+
+            /// Writes the values of the vector to the `slice`.
+            ///
+            /// # Panics
+            ///
+            /// If `slice.len() < Self::lanes()`.
+            #[inline]
+            pub fn store_unaligned(self, slice: &mut [$elem_ty]) {
+                unsafe {
+                    assert!(slice.len() >= $elem_count);
+                    self.store_unaligned_unchecked(slice);
+                }
+            }
+
+            /// Writes the values of the vector to the `slice`.
+            ///
+            /// # Precondition
+            ///
+            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not
+            /// aligned to an `align_of::<Self>()` boundary, the behavior is
+            /// undefined.
+            #[inline]
+            pub unsafe fn store_aligned_unchecked(
+                self,
+                slice: &mut [$elem_ty]
+
+            ) {
+                *(slice.get_unchecked_mut(0) as *mut $elem_ty as *mut Self) = self;
+            }
+
+            /// Writes the values of the vector to the `slice`.
+            ///
+            /// # Precondition
+            ///
+            /// If `slice.len() < Self::lanes()` the behavior is undefined.
+            #[inline]
+            pub unsafe fn store_unaligned_unchecked(
+                self,
+                slice: &mut [$elem_ty]
+            ) {
+                let target_ptr = slice.get_unchecked_mut(0) as *mut $elem_ty as *mut u8;
+                let self_ptr = &self as *const Self as *const u8;
+                ptr::copy_nonoverlapping(self_ptr, target_ptr, mem::size_of::<Self>());
+            }
+
+            /// Instantiates a new vector with the values of the `slice`.
+            ///
+            /// # Panics
+            ///
+            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned
+            /// to an `align_of::<Self>()` boundary.
+            #[inline]
+            pub fn load_aligned(slice: &[$elem_ty]) -> Self {
+                unsafe {
+                    assert!(slice.len() >= $elem_count);
+                    let target_ptr = slice.get_unchecked(0) as *const $elem_ty;
+                    assert!(target_ptr.align_offset(mem::align_of::<Self>()) == 0);
+                    Self::load_aligned_unchecked(slice)
+                }
+            }
+
+            /// Instantiates a new vector with the values of the `slice`.
+            ///
+            /// # Panics
+            ///
+            /// If `slice.len() < Self::lanes()`.
+            #[inline]
+            pub fn load_unaligned(slice: &[$elem_ty]) -> Self {
+                unsafe {
+                    assert!(slice.len() >= $elem_count);
+                    Self::load_unaligned_unchecked(slice)
+                }
+            }
+
+            /// Instantiates a new vector with the values of the `slice`.
+            ///
+            /// # Precondition
+            ///
+            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned
+            /// to an `align_of::<Self>()` boundary, the behavior is undefined.
+            #[inline]
+            pub unsafe fn load_aligned_unchecked(slice: &[$elem_ty]) -> Self {
+                *(slice.get_unchecked(0) as *const $elem_ty as *const Self)
+            }
+
+            /// Instantiates a new vector with the values of the `slice`.
+            ///
+            /// # Precondition
+            ///
+            /// If `slice.len() < Self::lanes()` the behavior is undefined.
+            #[inline]
+            pub unsafe fn load_unaligned_unchecked(slice: &[$elem_ty]) -> Self {
+                use mem::size_of;
+                let target_ptr = slice.get_unchecked(0) as *const $elem_ty as *const u8;
+                let mut x = Self::splat(0 as $elem_ty);
+                let self_ptr = &mut x as *mut Self as *mut u8;
+                ptr::copy_nonoverlapping(target_ptr, self_ptr, size_of::<Self>());
+                x
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+#[macro_export]
+macro_rules!
test_load_store { + ($id:ident, $elem_ty:ident) => { + #[test] + fn store_unaligned() { + use ::coresimd::simd::$id; + use ::std::iter::Iterator; + let mut unaligned = [0 as $elem_ty; $id::lanes() + 1]; + let vec = $id::splat(42 as $elem_ty); + vec.store_unaligned(&mut unaligned[1..]); + for (index, &b) in unaligned.iter().enumerate() { + if index == 0 { + assert_eq!(b, 0 as $elem_ty); + } else { + assert_eq!(b, vec.extract(index - 1)); + } + } + } + + #[test] + #[should_panic] + fn store_unaligned_fail() { + use ::coresimd::simd::$id; + let mut unaligned = [0 as $elem_ty; $id::lanes() + 1]; + let vec = $id::splat(42 as $elem_ty); + vec.store_unaligned(&mut unaligned[2..]); + } + + #[test] + fn load_unaligned() { + use ::coresimd::simd::$id; + use ::std::iter::Iterator; + let mut unaligned = [42 as $elem_ty; $id::lanes() + 1]; + unaligned[0] = 0 as $elem_ty; + let vec = $id::load_unaligned(&unaligned[1..]); + for (index, &b) in unaligned.iter().enumerate() { + if index == 0 { + assert_eq!(b, 0 as $elem_ty); + } else { + assert_eq!(b, vec.extract(index - 1)); + } + } + } + + #[test] + #[should_panic] + fn load_unaligned_fail() { + use ::coresimd::simd::$id; + let mut unaligned = [42 as $elem_ty; $id::lanes() + 1]; + unaligned[0] = 0 as $elem_ty; + let _vec = $id::load_unaligned(&unaligned[2..]); + } + + union A { + data: [$elem_ty; 2 * ::coresimd::simd::$id::lanes()], + vec: ::coresimd::simd::$id, + } + + #[test] + fn store_aligned() { + use ::coresimd::simd::$id; + use ::std::iter::Iterator; + let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + let vec = $id::splat(42 as $elem_ty); + unsafe { vec.store_aligned(&mut aligned.data[$id::lanes()..]) }; + for (index, &b) in unsafe { aligned.data.iter().enumerate() } { + if index < $id::lanes() { + assert_eq!(b, 0 as $elem_ty); + } else { + assert_eq!(b, vec.extract(index - $id::lanes())); + } + } + } + + #[test] + #[should_panic] + fn store_aligned_fail_lanes() { + use ::coresimd::simd::$id; + let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + let vec = $id::splat(42 as $elem_ty); + unsafe { vec.store_aligned(&mut aligned.data[2 * $id::lanes()..]) }; + } + + #[test] + #[should_panic] + fn store_aligned_fail_align() { + unsafe { + use ::coresimd::simd::$id; + use ::std::{slice, mem}; + let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + // offset the aligned data by one byte: + let s: &mut [u8; 2 * $id::lanes() * mem::size_of::<$elem_ty>()] + = mem::transmute(&mut aligned.data); + let s: &mut [$elem_ty] = slice::from_raw_parts_mut( + s.get_unchecked_mut(1) as *mut u8 as *mut $elem_ty, + $id::lanes() + ); + let vec = $id::splat(42 as $elem_ty); + vec.store_aligned(s); + } + } + + #[test] + fn load_aligned() { + use ::coresimd::simd::$id; + use ::std::iter::Iterator; + let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + for i in $id::lanes()..(2*$id::lanes()) { + unsafe { aligned.data[i] = 42 as $elem_ty; } + } + + let vec = unsafe { $id::load_aligned(&aligned.data[$id::lanes()..]) }; + for (index, &b) in unsafe { aligned.data.iter().enumerate() } { + if index < $id::lanes() { + assert_eq!(b, 0 as $elem_ty); + } else { + assert_eq!(b, vec.extract(index - $id::lanes())); + } + } + } + + #[test] + #[should_panic] + fn load_aligned_fail_lanes() { + use ::coresimd::simd::$id; + let aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + let _vec = unsafe { $id::load_aligned(&aligned.data[2 * $id::lanes()..]) }; + } + + #[test] + #[should_panic] + fn load_aligned_fail_align() { + unsafe { + 
use ::coresimd::simd::$id; + use ::std::{slice, mem}; + let aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()] }; + // offset the aligned data by one byte: + let s: &[u8; 2 * $id::lanes() * mem::size_of::<$elem_ty>()] + = mem::transmute(&aligned.data); + let s: &[$elem_ty] = slice::from_raw_parts( + s.get_unchecked(1) as *const u8 as *const $elem_ty, + $id::lanes() + ); + let _vec = $id::load_aligned(s); + } + } + } +} diff --git a/coresimd/ppsv/api/minimal.rs b/coresimd/ppsv/api/minimal.rs new file mode 100644 index 0000000000..3def265e21 --- /dev/null +++ b/coresimd/ppsv/api/minimal.rs @@ -0,0 +1,142 @@ +//! + +/// Minimal interface: all packed SIMD vector types implement this. +macro_rules! impl_minimal { + ($id:ident, $elem_ty:ident, $elem_count:expr, $($elem_name:ident),+) => { + #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] + impl Clone for $id { + #[inline] // currently needed for correctness + fn clone(&self) -> Self { + *self + } + } + + impl $id { + /// Creates a new instance with each vector elements initialized + /// with the provided values. + #[inline] + pub const fn new($($elem_name: $elem_ty),*) -> Self { + $id($($elem_name),*) + } + + /// Returns the number of vector lanes. + #[inline] + pub const fn lanes() -> usize { + $elem_count + } + + /// Constructs a new instance with each element initialized to + /// `value`. + #[inline] + pub const fn splat(value: $elem_ty) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + value + }),*) + } + + /// Extracts the value at `index`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + pub fn extract(self, index: usize) -> $elem_ty { + assert!(index < $elem_count); + unsafe { self.extract_unchecked(index) } + } + + /// Extracts the value at `index`. + /// + /// If `index >= Self::lanes()` the behavior is undefined. + #[inline] + pub unsafe fn extract_unchecked(self, index: usize) -> $elem_ty { + simd_extract(self, index as u32) + } + + /// Returns a new vector where the value at `index` is replaced by `new_value`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + #[must_use = "replace does not modify the original value - it returns a new vector with the value at `index` replaced by `new_value`d"] + pub fn replace(self, index: usize, new_value: $elem_ty) -> Self { + assert!(index < $elem_count); + unsafe { self.replace_unchecked(index, new_value) } + } + + /// Returns a new vector where the value at `index` is replaced by `new_value`. + /// + /// # Panics + /// + /// If `index >= Self::lanes()`. + #[inline] + #[must_use = "replace_unchecked does not modify the original value - it returns a new vector with the value at `index` replaced by `new_value`d"] + pub unsafe fn replace_unchecked( + self, + index: usize, + new_value: $elem_ty, + ) -> Self { + simd_insert(self, index as u32, new_value) + } + } + } +} + +#[cfg(test)] +macro_rules! 
test_minimal { + ($id:ident, $elem_ty:ident, $elem_count:expr) => { + #[test] + fn minimal() { + use ::coresimd::simd::$id; + // TODO: test new + + // lanes: + assert_eq!($elem_count, $id::lanes()); + + // splat and extract / extract_unchecked: + const VAL: $elem_ty = 7 as $elem_ty; + const VEC: $id = $id::splat(VAL); + for i in 0..$id::lanes() { + assert_eq!(VAL, VEC.extract(i)); + assert_eq!(VAL, unsafe { VEC.extract_unchecked(i) }); + } + + // replace / replace_unchecked + let new_vec = VEC.replace(1, 42 as $elem_ty); + for i in 0..$id::lanes() { + if i == 1 { + assert_eq!(42 as $elem_ty, new_vec.extract(i)); + } else { + assert_eq!(VAL, new_vec.extract(i)); + } + } + let new_vec = unsafe { VEC.replace_unchecked(1, 42 as $elem_ty) }; + for i in 0..$id::lanes() { + if i == 1 { + assert_eq!(42 as $elem_ty, new_vec.extract(i)); + } else { + assert_eq!(VAL, new_vec.extract(i)); + } + } + } + #[test] + #[should_panic] + fn minimal_extract_panic_on_out_of_bounds() { + use ::coresimd::simd::$id; + const VAL: $elem_ty = 7 as $elem_ty; + const VEC: $id = $id::splat(VAL); + let _ = VEC.extract($id::lanes()); + } + #[test] + #[should_panic] + fn minimal_replace_panic_on_out_of_bounds() { + use ::coresimd::simd::$id; + const VAL: $elem_ty = 7 as $elem_ty; + const VEC: $id = $id::splat(VAL); + let _ = VEC.replace($id::lanes(), 42 as $elem_ty); + } + } +} diff --git a/coresimd/ppsv/api/minmax_reductions.rs b/coresimd/ppsv/api/minmax_reductions.rs new file mode 100644 index 0000000000..7c158ab404 --- /dev/null +++ b/coresimd/ppsv/api/minmax_reductions.rs @@ -0,0 +1,49 @@ +//! Implements portable arithmetic vector reductions. + +macro_rules! impl_minmax_reductions { + ($id:ident, $elem_ty:ident) => { + impl $id { + /// Largest vector value. + #[inline] + pub fn max(self) -> $elem_ty { + ReduceMax::reduce_max(self) + } + /// Smallest vector value. + #[inline] + pub fn min(self) -> $elem_ty { + ReduceMin::reduce_min(self) + } + } + } +} + +#[cfg(test)] +macro_rules! test_minmax_reductions { + ($id:ident, $elem_ty:ident) => { + #[test] + fn max() { + use ::coresimd::simd::$id; + let v = $id::splat(0 as $elem_ty); + assert_eq!(v.max(), 0 as $elem_ty); + let v = v.replace(1, 1 as $elem_ty); + assert_eq!(v.max(), 1 as $elem_ty); + let v = v.replace(0, 2 as $elem_ty); + assert_eq!(v.max(), 2 as $elem_ty); + } + + #[test] + fn min() { + use ::coresimd::simd::$id; + let v = $id::splat(0 as $elem_ty); + assert_eq!(v.min(), 0 as $elem_ty); + let v = v.replace(1, 1 as $elem_ty); + assert_eq!(v.min(), 0 as $elem_ty); + let v = $id::splat(1 as $elem_ty); + let v = v.replace(0, 2 as $elem_ty); + assert_eq!(v.min(), 1 as $elem_ty); + let v = $id::splat(2 as $elem_ty); + let v = v.replace(1, 1 as $elem_ty); + assert_eq!(v.min(), 1 as $elem_ty); + } + } +} diff --git a/coresimd/ppsv/api/mod.rs b/coresimd/ppsv/api/mod.rs new file mode 100644 index 0000000000..cfd201ed26 --- /dev/null +++ b/coresimd/ppsv/api/mod.rs @@ -0,0 +1,282 @@ +//! This module defines the API of portable vector types. +//! +//! # API +//! +//! ## Traits +//! +//! All portable vector types implement the following traits: +//! +//! * [x] `Copy`, +//! * [x] `Clone`, +//! * [x] `Debug`, +//! * [x] `Default` +//! * [x] `PartialEq` +//! * [x] `PartialOrd` (TODO: re-write in term of +//! comparison operations and boolean reductions), +//! +//! Non-floating-point vector types also implement: +//! +//! * [x] `Hash`, +//! * [x] `Eq`, and +//! * [x] `Ord`. +//! +//! Integer vector types also implement: +//! +//! * [x] `fmt::LowerHex`. +//! +//! 
## Conversions +//! +//! * [x]: `FromBits/IntoBits`: bitwise lossless transmutes between vectors of +//! the same size (i.e., same `mem::size_of`). +//! * [x]: `From/Into`: casts between vectors with the same number of lanes +//! (potentially lossy). +//! +//! ## Inherent methods +//! +//! * [x] minimal API: implemented by all vector types except for boolean +//! vectors. +//! * [x] minimal boolean vector API: implemented by boolean vectors. +//! * [x] load/store API: aligned and unaligned memory loads and +//! stores - implemented by all vectors. +//! * [x] comparison API: vector lane-wise comparison producing +//! boolean vectors - implemented by all vectors. +//! * [x] arithmetic operations: implemented by all non-boolean vectors. +//! * [x] `std::ops::Neg`: implemented by signed-integer and floating-point +//! vectors. +//! * [x] bitwise operations: implemented by integer and boolean +//! vectors. +//! * [x] shift operations: implemented by integer vectors. +//! * [x] arithmetic reductions: implemented by integer and floating-point +//! vectors. +//! * [x] bitwise reductions: implemented by integer and boolean +//! vectors. +//! * [x] boolean reductions: implemented by boolean vectors. +//! * [ ] portable shuffles: `shufflevector`. +//! * [ ] portable `gather`/`scatter`: + +/// Adds the vector type `$id`, with elements of types `$elem_tys`. +macro_rules! define_ty { + ($id:ident, $($elem_tys:ident),+ | $(#[$doc:meta])*) => { + $(#[$doc])* + #[repr(simd)] + #[derive(Copy, Debug, /*FIXME:*/ PartialOrd)] + #[allow(non_camel_case_types)] + pub struct $id($($elem_tys),*); + } +} + +#[macro_use] +mod arithmetic_ops; +#[macro_use] +mod arithmetic_reductions; +#[macro_use] +mod bitwise_ops; +#[macro_use] +mod bitwise_reductions; +#[macro_use] +mod boolean_reductions; +#[macro_use] +mod bool_vectors; +#[macro_use] +mod cmp; +#[macro_use] +mod default; +#[macro_use] +mod eq; +#[macro_use] +mod fmt; +#[macro_use] +mod from; +#[macro_use] +mod from_bits; +#[macro_use] +mod hash; +#[macro_use] +mod load_store; +#[macro_use] +mod minimal; +#[macro_use] +mod minmax_reductions; +#[macro_use] +mod neg; +#[macro_use] +mod partial_eq; +// TODO: +//#[macro_use] +//mod partial_ord; +// TODO: +//#[macro_use] +//mod shuffles; +// TODO: +//#[macro_use] +//mod gather_scatter; +#[macro_use] +mod shifts; + +/// Imports required to implement vector types using the macros. + +macro_rules! simd_api_imports { + () => { + use ::coresimd::simd_llvm::*; + use fmt; + use hash; + use ops; + #[allow(unused_imports)] + use num; + use cmp::{Eq, PartialEq}; + use ptr; + use mem; + #[allow(unused_imports)] + use convert::{From, Into}; + use slice::SliceExt; + #[allow(unused_imports)] + use iter::Iterator; + #[allow(unused_imports)] + use default::Default; + use clone::Clone; + use super::codegen::sum::{ReduceAdd}; + use super::codegen::product::{ReduceMul}; + use super::codegen::and::{ReduceAnd}; + use super::codegen::or::{ReduceOr}; + use super::codegen::xor::{ReduceXor}; + use super::codegen::min::{ReduceMin}; + use super::codegen::max::{ReduceMax}; + } +} + +/// Defines a portable packed SIMD floating-point vector type. +macro_rules! 
simd_f_ty { + ($id:ident : $elem_count:expr, $elem_ty:ident, $bool_ty:ident, $test_mod:ident | + $($elem_tys:ident),+ | $($elem_name:ident),+ | $(#[$doc:meta])*) => { + define_ty!($id, $($elem_tys),+ | $(#[$doc])*); + impl_minimal!($id, $elem_ty, $elem_count, $($elem_name),*); + impl_load_store!($id, $elem_ty, $elem_count); + impl_cmp!($id, $bool_ty); + impl_arithmetic_ops!($id); + impl_arithmetic_reductions!($id, $elem_ty); + impl_minmax_reductions!($id, $elem_ty); + impl_neg_op!($id, $elem_ty); + impl_partial_eq!($id); + impl_default!($id, $elem_ty); + + #[cfg(test)] + mod $test_mod { + test_minimal!($id, $elem_ty, $elem_count); + test_load_store!($id, $elem_ty); + test_cmp!($id, $elem_ty, $bool_ty, 1. as $elem_ty, 0. as $elem_ty); + test_arithmetic_ops!($id, $elem_ty); + test_arithmetic_reductions!($id, $elem_ty); + test_minmax_reductions!($id, $elem_ty); + test_neg_op!($id, $elem_ty); + test_partial_eq!($id, 1. as $elem_ty, 0. as $elem_ty); + test_default!($id, $elem_ty); + } + } +} + +/// Defines a portable packed SIMD signed-integer vector type. +macro_rules! simd_i_ty { + ($id:ident : $elem_count:expr, $elem_ty:ident, $bool_ty:ident, $test_mod:ident | + $($elem_tys:ident),+ | $($elem_name:ident),+ | $(#[$doc:meta])*) => { + define_ty!($id, $($elem_tys),+ | $(#[$doc])*); + impl_minimal!($id, $elem_ty, $elem_count, $($elem_name),*); + impl_load_store!($id, $elem_ty, $elem_count); + impl_cmp!($id, $bool_ty); + impl_hash!($id, $elem_ty); + impl_arithmetic_ops!($id); + impl_arithmetic_reductions!($id, $elem_ty); + impl_minmax_reductions!($id, $elem_ty); + impl_neg_op!($id, $elem_ty); + impl_bitwise_ops!($id, !(0 as $elem_ty)); + impl_bitwise_reductions!($id, $elem_ty); + impl_all_shifts!($id, $elem_ty); + impl_hex_fmt!($id, $elem_ty); + impl_eq!($id); + impl_partial_eq!($id); + impl_default!($id, $elem_ty); + + #[cfg(test)] + mod $test_mod { + test_minimal!($id, $elem_ty, $elem_count); + test_load_store!($id, $elem_ty); + test_cmp!($id, $elem_ty, $bool_ty, 1 as $elem_ty, 0 as $elem_ty); + test_hash!($id, $elem_ty); + test_arithmetic_ops!($id, $elem_ty); + test_arithmetic_reductions!($id, $elem_ty); + test_minmax_reductions!($id, $elem_ty); + test_neg_op!($id, $elem_ty); + test_int_bitwise_ops!($id, $elem_ty); + test_bitwise_reductions!($id, !(0 as $elem_ty)); + test_all_shift_ops!($id, $elem_ty); + test_hex_fmt!($id, $elem_ty); + test_partial_eq!($id, 1 as $elem_ty, 0 as $elem_ty); + test_default!($id, $elem_ty); + } + } +} + +/// Defines a portable packed SIMD unsigned-integer vector type. +macro_rules! 
simd_u_ty { + ($id:ident : $elem_count:expr, $elem_ty:ident, $bool_ty:ident, $test_mod:ident | + $($elem_tys:ident),+ | $($elem_name:ident),+ | $(#[$doc:meta])*) => { + define_ty!($id, $($elem_tys),+ | $(#[$doc])*); + impl_minimal!($id, $elem_ty, $elem_count, $($elem_name),*); + impl_load_store!($id, $elem_ty, $elem_count); + impl_cmp!($id, $bool_ty); + impl_hash!($id, $elem_ty); + impl_arithmetic_ops!($id); + impl_arithmetic_reductions!($id, $elem_ty); + impl_minmax_reductions!($id, $elem_ty); + impl_bitwise_ops!($id, !(0 as $elem_ty)); + impl_bitwise_reductions!($id, $elem_ty); + impl_all_shifts!($id, $elem_ty); + impl_hex_fmt!($id, $elem_ty); + impl_eq!($id); + impl_partial_eq!($id); + impl_default!($id, $elem_ty); + + #[cfg(test)] + mod $test_mod { + test_minimal!($id, $elem_ty, $elem_count); + test_load_store!($id, $elem_ty); + test_cmp!($id, $elem_ty, $bool_ty, 1 as $elem_ty, 0 as $elem_ty); + test_hash!($id, $elem_ty); + test_arithmetic_ops!($id, $elem_ty); + test_arithmetic_reductions!($id, $elem_ty); + test_minmax_reductions!($id, $elem_ty); + test_int_bitwise_ops!($id, $elem_ty); + test_bitwise_reductions!($id, !(0 as $elem_ty)); + test_all_shift_ops!($id, $elem_ty); + test_hex_fmt!($id, $elem_ty); + test_partial_eq!($id, 1 as $elem_ty, 0 as $elem_ty); + test_default!($id, $elem_ty); + } + } +} + +/// Defines a portable packed SIMD boolean vector type. +macro_rules! simd_b_ty { + ($id:ident : $elem_count:expr, $elem_ty:ident, $test_mod:ident | + $($elem_tys:ident),+ | $($elem_name:ident),+ | $(#[$doc:meta])*) => { + define_ty!($id, $($elem_tys),+ | $(#[$doc])*); + impl_bool_minimal!($id, $elem_ty, $elem_count, $($elem_name),*); + impl_bitwise_ops!($id, true); + impl_bool_bitwise_reductions!($id, bool); + impl_bool_reductions!($id); + impl_bool_cmp!($id, $id); + impl_eq!($id); + impl_partial_eq!($id); + impl_default!($id, bool); + + #[cfg(test)] + mod $test_mod { + test_bool_minimal!($id, $elem_count); + test_bool_bitwise_ops!($id); + test_bool_reductions!($id); + test_bitwise_reductions!($id, true); + test_cmp!($id, $elem_ty, $id, true, false); + test_partial_eq!($id, true, false); + test_default!($id, bool); + } + } +} diff --git a/coresimd/ppsv/api/neg.rs b/coresimd/ppsv/api/neg.rs new file mode 100644 index 0000000000..aa1cffbf7f --- /dev/null +++ b/coresimd/ppsv/api/neg.rs @@ -0,0 +1,43 @@ +//! Implements `std::ops::Neg` for signed vector types. + +macro_rules! impl_neg_op { + ($id:ident, $elem_ty:ident) => { + impl ops::Neg for $id { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::splat(-1 as $elem_ty) * self + } + } + }; +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_neg_op { + ($id:ident, $elem_ty:ident) => { + #[test] + fn neg() { + use ::coresimd::simd::$id; + let z = $id::splat(0 as $elem_ty); + let o = $id::splat(1 as $elem_ty); + let t = $id::splat(2 as $elem_ty); + let f = $id::splat(4 as $elem_ty); + + let nz = $id::splat(-(0 as $elem_ty)); + let no = $id::splat(-(1 as $elem_ty)); + let nt = $id::splat(-(2 as $elem_ty)); + let nf = $id::splat(-(4 as $elem_ty)); + + assert_eq!(-z, nz); + assert_eq!(-o, no); + assert_eq!(-t, nt); + assert_eq!(-f, nf); + + assert_eq!(z, -nz); + assert_eq!(o, -no); + assert_eq!(t, -nt); + assert_eq!(f, -nf); + } + }; +} diff --git a/coresimd/ppsv/api/partial_eq.rs b/coresimd/ppsv/api/partial_eq.rs new file mode 100644 index 0000000000..70e7a9f966 --- /dev/null +++ b/coresimd/ppsv/api/partial_eq.rs @@ -0,0 +1,35 @@ +//! Implements `PartialEq` for vector types. + +macro_rules! 
impl_partial_eq { + ($id:ident) => { + impl PartialEq<$id> for $id { + #[inline] + fn eq(&self, other: &Self) -> bool { + $id::eq(*self, *other).all() + } + #[inline] + fn ne(&self, other: &Self) -> bool { + $id::ne(*self, *other).all() + } + } + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_partial_eq { + ($id:ident, $true:expr, $false:expr) => { + #[test] + fn partial_eq() { + use ::coresimd::simd::*; + + let a = $id::splat($false); + let b = $id::splat($true); + + assert!(a != b); + assert!(!(a == b)); + assert!(a == a); + assert!(!(a != a)); + } + } +} diff --git a/coresimd/ppsv/api/shifts.rs b/coresimd/ppsv/api/shifts.rs new file mode 100644 index 0000000000..1447447eea --- /dev/null +++ b/coresimd/ppsv/api/shifts.rs @@ -0,0 +1,122 @@ +//! Implements integer shifts. + +macro_rules! impl_shifts { + ($id:ident, $elem_ty:ident, $($by:ident),+) => { + $( + impl ::ops::Shl<$by> for $id { + type Output = Self; + #[inline] + fn shl(self, other: $by) -> Self { + unsafe { simd_shl(self, $id::splat(other as $elem_ty)) } + } + } + impl ::ops::Shr<$by> for $id { + type Output = Self; + #[inline] + fn shr(self, other: $by) -> Self { + unsafe { simd_shr(self, $id::splat(other as $elem_ty)) } + } + } + + impl ::ops::ShlAssign<$by> for $id { + #[inline] + fn shl_assign(&mut self, other: $by) { + *self = *self << other; + } + } + impl ::ops::ShrAssign<$by> for $id { + #[inline] + fn shr_assign(&mut self, other: $by) { + *self = *self >> other; + } + } + + )+ + } +} + +macro_rules! impl_all_shifts { + ($id:ident, $elem_ty:ident) => { + impl_shifts!( + $id, $elem_ty, + u8, u16, u32, u64, usize, + i8, i16, i32, i64, isize); + + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_shift_ops { + ($id:ident, $elem_ty:ident, $($index_ty:ident),+) => { + #[test] + fn shift_ops() { + use ::coresimd::simd::$id; + use ::std::mem; + let z = $id::splat(0 as $elem_ty); + let o = $id::splat(1 as $elem_ty); + let t = $id::splat(2 as $elem_ty); + let f = $id::splat(4 as $elem_ty); + + $( + { + let zi = 0 as $index_ty; + let oi = 1 as $index_ty; + let ti = 2 as $index_ty; + let maxi = (mem::size_of::<$elem_ty>() * 8 - 1) as $index_ty; + + // shr + assert_eq!(z >> zi, z); + assert_eq!(z >> oi, z); + assert_eq!(z >> ti, z); + assert_eq!(z >> ti, z); + + assert_eq!(o >> zi, o); + assert_eq!(t >> zi, t); + assert_eq!(f >> zi, f); + assert_eq!(f >> maxi, z); + + assert_eq!(o >> oi, z); + assert_eq!(t >> oi, o); + assert_eq!(t >> ti, z); + assert_eq!(f >> oi, t); + assert_eq!(f >> ti, o); + assert_eq!(f >> maxi, z); + + // shl + assert_eq!(z << zi, z); + assert_eq!(o << zi, o); + assert_eq!(t << zi, t); + assert_eq!(f << zi, f); + assert_eq!(f << maxi, z); + + assert_eq!(o << oi, t); + assert_eq!(o << ti, f); + assert_eq!(t << oi, f); + + { // shr_assign + let mut v = o; + v >>= oi; + assert_eq!(v, z); + } + { // shl_assign + let mut v = o; + v <<= oi; + assert_eq!(v, t); + } + } + )+ + } + }; +} + +#[cfg(test)] +#[macro_export] +macro_rules! test_all_shift_ops { + ($id:ident, $elem_ty:ident) => { + test_shift_ops!( + $id, $elem_ty, + u8, u16, u32, u64, usize, + i8, i16, i32, i64, isize); + } +} diff --git a/coresimd/ppsv/codegen/and.rs b/coresimd/ppsv/codegen/and.rs new file mode 100644 index 0000000000..c149a798fd --- /dev/null +++ b/coresimd/ppsv/codegen/and.rs @@ -0,0 +1,170 @@ +//! Code generation for the and reduction. 
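Note (illustrative sketch, not part of this patch): each codegen module binds one monomorphic LLVM reduction intrinsic per vector type, and the link-name suffix spells out the scalar result type and the vector type (e.g. `and.i8.v2i8`). Semantically, `ReduceAnd::reduce_and` folds `&` over the lanes; assuming `ReduceAnd` and the portable types are imported as in the tests at the bottom of this file:

    let v = u32x4::new(0b0111, 0b0101, 0b1101, 0b0101);
    assert_eq!(ReduceAnd::reduce_and(v), 0b0101); // 0b0111 & 0b0101 & 0b1101 & 0b0101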
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the and reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.and.i8.v2i8"] + fn reduce_and_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v2u8"] + fn reduce_and_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v2i16"] + fn reduce_and_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v2u16"] + fn reduce_and_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i32.v2i32"] + fn reduce_and_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.and.u32.v2u32"] + fn reduce_and_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.and.i64.v2i64"] + fn reduce_and_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.and.u64.v2u64"] + fn reduce_and_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v4i8"] + fn reduce_and_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v4u8"] + fn reduce_and_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v4i16"] + fn reduce_and_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v4u16"] + fn reduce_and_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i32.v4i32"] + fn reduce_and_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.and.u32.v4u32"] + fn reduce_and_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.and.i64.v4i64"] + fn reduce_and_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.and.u64.v4u64"] + fn reduce_and_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v8i8"] + fn reduce_and_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v8u8"] + fn reduce_and_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v8i16"] + fn reduce_and_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v8u16"] + fn reduce_and_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i32.v8i32"] + fn reduce_and_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.and.u32.v8u32"] + fn reduce_and_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.and.i64.v8i64"] + fn reduce_and_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.and.u64.v8u64"] + fn reduce_and_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v16i8"] + fn reduce_and_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v16u8"] + fn reduce_and_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v16i16"] + fn reduce_and_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v16u16"] + fn reduce_and_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i32.v16i32"] + fn reduce_and_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.and.u32.v16u32"] + fn reduce_and_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v32i8"] + fn reduce_and_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v32u8"] + fn reduce_and_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.and.i16.v32i16"] + fn reduce_and_i16x32(x: i16x32) -> 
i16; + #[link_name = "llvm.experimental.vector.reduce.and.u16.v32u16"] + fn reduce_and_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.and.i8.v64i8"] + fn reduce_and_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.and.u8.v64u8"] + fn reduce_and_u8x64(x: u8x64) -> u8; +} + +/// Reduction: horizontal bitwise and of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceAnd { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal bitwise and of the vector elements + fn reduce_and(self) -> Self::Acc; +} + +macro_rules! red_and { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceAnd for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_and(self) -> Self::Acc { + unsafe { $llvm_intr(self.into_bits()) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_and(self) -> Self::Acc { + let mut x = self.extract(0) as Self::Acc; + for i in 1..$id::lanes() { + x &= self.extract(i) as Self::Acc; + } + x + } + } + }; +} +red_and!(i8x2, i8, reduce_and_i8x2); +red_and!(u8x2, u8, reduce_and_u8x2); +red_and!(i16x2, i16, reduce_and_i16x2); +red_and!(u16x2, u16, reduce_and_u16x2); +red_and!(i32x2, i32, reduce_and_i32x2); +red_and!(u32x2, u32, reduce_and_u32x2); +red_and!(i64x2, i64, reduce_and_i64x2); +red_and!(u64x2, u64, reduce_and_u64x2); +red_and!(i8x4, i8, reduce_and_i8x4); +red_and!(u8x4, u8, reduce_and_u8x4); +red_and!(i16x4, i16, reduce_and_i16x4); +red_and!(u16x4, u16, reduce_and_u16x4); +red_and!(i32x4, i32, reduce_and_i32x4); +red_and!(u32x4, u32, reduce_and_u32x4); +red_and!(i64x4, i64, reduce_and_i64x4); +red_and!(u64x4, u64, reduce_and_u64x4); +red_and!(i8x8, i8, reduce_and_i8x8); +red_and!(u8x8, u8, reduce_and_u8x8); +red_and!(i16x8, i16, reduce_and_i16x8); +red_and!(u16x8, u16, reduce_and_u16x8); +red_and!(i32x8, i32, reduce_and_i32x8); +red_and!(u32x8, u32, reduce_and_u32x8); +red_and!(i64x8, i64, reduce_and_i64x8); +red_and!(u64x8, u64, reduce_and_u64x8); +red_and!(i8x16, i8, reduce_and_i8x16); +red_and!(u8x16, u8, reduce_and_u8x16); +red_and!(i16x16, i16, reduce_and_i16x16); +red_and!(u16x16, u16, reduce_and_u16x16); +red_and!(i32x16, i32, reduce_and_i32x16); +red_and!(u32x16, u32, reduce_and_u32x16); +red_and!(i8x32, i8, reduce_and_i8x32); +red_and!(u8x32, u8, reduce_and_u8x32); +red_and!(i16x32, i16, reduce_and_i16x32); +red_and!(u16x32, u16, reduce_and_u16x32); +red_and!(i8x64, i8, reduce_and_i8x64); +red_and!(u8x64, u8, reduce_and_u8x64); + +red_and!(b8x2, i8, reduce_and_i8x2); +red_and!(b8x4, i8, reduce_and_i8x4); +red_and!(b8x8, i8, reduce_and_i8x8); +red_and!(b8x16, i8, reduce_and_i8x16); +red_and!(b8x32, i8, reduce_and_i8x32); +red_and!(b8x64, i8, reduce_and_i8x64); + +#[cfg(test)] +mod tests { + use super::ReduceAnd; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_and_i32x4() { + let v = i32x4::splat(1); + assert_eq!(v.reduce_and(), 1_i32); + let v = i32x4::new(1, 1, 0, 1); + assert_eq!(v.reduce_and(), 0_i32); + } +} diff --git a/coresimd/ppsv/codegen/max.rs b/coresimd/ppsv/codegen/max.rs new file mode 100644 index 0000000000..7ec4884b5a --- /dev/null +++ b/coresimd/ppsv/codegen/max.rs @@ -0,0 +1,196 @@ +//! Code generation for the max reduction. 
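Note (illustrative sketch, not part of this patch): signed and unsigned lanes are bound to different intrinsics (`smax` vs. `umax`), and the floating-point vectors use the `fmax` variants; the reduction behaves like a lane-by-lane fold of `max`. Assuming `ReduceMax` and the portable types are imported as in the tests below:

    assert_eq!(ReduceMax::reduce_max(i8x4::new(-1, 2, 3, 4)), 4);
    assert_eq!(ReduceMax::reduce_max(u8x4::new(255, 2, 3, 4)), 255);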
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the max reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v2i8"] + fn reduce_max_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v2u8"] + fn reduce_max_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v2i16"] + fn reduce_max_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v2u16"] + fn reduce_max_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i32.v2i32"] + fn reduce_max_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umax.u32.v2u32"] + fn reduce_max_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smax.i64.v2i64"] + fn reduce_max_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umax.u64.v2u64"] + fn reduce_max_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v4i8"] + fn reduce_max_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v4u8"] + fn reduce_max_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v4i16"] + fn reduce_max_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v4u16"] + fn reduce_max_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i32.v4i32"] + fn reduce_max_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umax.u32.v4u32"] + fn reduce_max_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smax.i64.v4i64"] + fn reduce_max_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umax.u64.v4u64"] + fn reduce_max_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v8i8"] + fn reduce_max_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v8u8"] + fn reduce_max_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v8i16"] + fn reduce_max_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v8u16"] + fn reduce_max_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i32.v8i32"] + fn reduce_max_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umax.u32.v8u32"] + fn reduce_max_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smax.i64.v8i64"] + fn reduce_max_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umax.u64.v8u64"] + fn reduce_max_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v16i8"] + fn reduce_max_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v16u8"] + fn reduce_max_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v16i16"] + fn reduce_max_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v16u16"] + fn reduce_max_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i32.v16i32"] + fn reduce_max_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umax.u32.v16u32"] + fn reduce_max_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v32i8"] + fn reduce_max_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v32u8"] + fn reduce_max_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smax.i16.v32i16"] + fn 
reduce_max_i16x32(x: i16x32) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umax.u16.v32u16"] + fn reduce_max_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smax.i8.v64i8"] + fn reduce_max_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umax.u8.v64u8"] + fn reduce_max_u8x64(x: u8x64) -> u8; + #[link_name = "llvm.experimental.vector.reduce.fmax.f32.v2f32"] + fn reduce_fmax_f32x2(x: f32x2) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmax.f64.v2f64"] + fn reduce_fmax_f64x2(x: f64x2) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmax.f32.v4f32"] + fn reduce_fmax_f32x4(x: f32x4) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmax.f64.v4f64"] + fn reduce_fmax_f64x4(x: f64x4) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmax.f32.v8f32"] + fn reduce_fmax_f32x8(x: f32x8) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmax.f64.v8f64"] + fn reduce_fmax_f64x8(x: f64x8) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmax.f32.v16f32"] + fn reduce_fmax_f32x16(x: f32x16) -> f32; +} + +/// Reduction: horizontal max of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceMax { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal max of the vector elements. + fn reduce_max(self) -> Self::Acc; +} + +macro_rules! red_max { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceMax for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_max(self) -> Self::Acc { + unsafe { $llvm_intr(self) } + } + // FIXME: broken on AArch64 + #[cfg(target_arch = "aarch64")] + #[allow(unused_imports)] + #[inline] + fn reduce_max(self) -> Self::Acc { + use num::Float; + use cmp::Ord; + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x = x.max(self.extract(i)); + } + x + } + } + }; +} +red_max!(i8x2, i8, reduce_max_i8x2); +red_max!(u8x2, u8, reduce_max_u8x2); +red_max!(i16x2, i16, reduce_max_i16x2); +red_max!(u16x2, u16, reduce_max_u16x2); +red_max!(i32x2, i32, reduce_max_i32x2); +red_max!(u32x2, u32, reduce_max_u32x2); +red_max!(i64x2, i64, reduce_max_i64x2); +red_max!(u64x2, u64, reduce_max_u64x2); +red_max!(i8x4, i8, reduce_max_i8x4); +red_max!(u8x4, u8, reduce_max_u8x4); +red_max!(i16x4, i16, reduce_max_i16x4); +red_max!(u16x4, u16, reduce_max_u16x4); +red_max!(i32x4, i32, reduce_max_i32x4); +red_max!(u32x4, u32, reduce_max_u32x4); +red_max!(i64x4, i64, reduce_max_i64x4); +red_max!(u64x4, u64, reduce_max_u64x4); +red_max!(i8x8, i8, reduce_max_i8x8); +red_max!(u8x8, u8, reduce_max_u8x8); +red_max!(i16x8, i16, reduce_max_i16x8); +red_max!(u16x8, u16, reduce_max_u16x8); +red_max!(i32x8, i32, reduce_max_i32x8); +red_max!(u32x8, u32, reduce_max_u32x8); +red_max!(i64x8, i64, reduce_max_i64x8); +red_max!(u64x8, u64, reduce_max_u64x8); +red_max!(i8x16, i8, reduce_max_i8x16); +red_max!(u8x16, u8, reduce_max_u8x16); +red_max!(i16x16, i16, reduce_max_i16x16); +red_max!(u16x16, u16, reduce_max_u16x16); +red_max!(i32x16, i32, reduce_max_i32x16); +red_max!(u32x16, u32, reduce_max_u32x16); +red_max!(i8x32, i8, reduce_max_i8x32); +red_max!(u8x32, u8, reduce_max_u8x32); +red_max!(i16x32, i16, reduce_max_i16x32); +red_max!(u16x32, u16, reduce_max_u16x32); +red_max!(i8x64, i8, reduce_max_i8x64); +red_max!(u8x64, u8, reduce_max_u8x64); + +red_max!(f32x2, f32, reduce_fmax_f32x2); +red_max!(f64x2, f64, reduce_fmax_f64x2); +red_max!(f32x4, f32, reduce_fmax_f32x4); +red_max!(f64x4, f64, 
reduce_fmax_f64x4); +red_max!(f32x8, f32, reduce_fmax_f32x8); +red_max!(f64x8, f64, reduce_fmax_f64x8); +red_max!(f32x16, f32, reduce_fmax_f32x16); + +#[cfg(test)] +mod tests { + use super::ReduceMax; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_max_i32x4() { + let v = i32x4::new(1, 2, -1, 3); + assert_eq!(v.reduce_max(), 3_i32); + } + #[test] + fn reduce_max_u32x4() { + let v = u32x4::new(4, 2, 7, 3); + assert_eq!(v.reduce_max(), 7_u32); + } + #[test] + fn reduce_max_f32x4() { + let v = f32x4::new(4., 2., -1., 3.); + assert_eq!(v.reduce_max(), 4.); + } +} diff --git a/coresimd/ppsv/codegen/min.rs b/coresimd/ppsv/codegen/min.rs new file mode 100644 index 0000000000..66e53a7057 --- /dev/null +++ b/coresimd/ppsv/codegen/min.rs @@ -0,0 +1,196 @@ +//! Code generation for the min reduction. +use ::coresimd::simd::*; + +/// LLVM intrinsics used in the min reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v2i8"] + fn reduce_min_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v2u8"] + fn reduce_min_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v2i16"] + fn reduce_min_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v2u16"] + fn reduce_min_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i32.v2i32"] + fn reduce_min_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umin.u32.v2u32"] + fn reduce_min_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smin.i64.v2i64"] + fn reduce_min_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umin.u64.v2u64"] + fn reduce_min_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v4i8"] + fn reduce_min_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v4u8"] + fn reduce_min_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v4i16"] + fn reduce_min_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v4u16"] + fn reduce_min_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i32.v4i32"] + fn reduce_min_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umin.u32.v4u32"] + fn reduce_min_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smin.i64.v4i64"] + fn reduce_min_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umin.u64.v4u64"] + fn reduce_min_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v8i8"] + fn reduce_min_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v8u8"] + fn reduce_min_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v8i16"] + fn reduce_min_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v8u16"] + fn reduce_min_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i32.v8i32"] + fn reduce_min_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umin.u32.v8u32"] + fn reduce_min_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smin.i64.v8i64"] + fn reduce_min_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.umin.u64.v8u64"] + fn reduce_min_u64x8(x: u64x8) -> u64; + #[link_name = 
"llvm.experimental.vector.reduce.smin.i8.v16i8"] + fn reduce_min_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v16u8"] + fn reduce_min_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v16i16"] + fn reduce_min_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v16u16"] + fn reduce_min_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i32.v16i32"] + fn reduce_min_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.umin.u32.v16u32"] + fn reduce_min_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v32i8"] + fn reduce_min_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v32u8"] + fn reduce_min_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.smin.i16.v32i16"] + fn reduce_min_i16x32(x: i16x32) -> i16; + #[link_name = "llvm.experimental.vector.reduce.umin.u16.v32u16"] + fn reduce_min_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.smin.i8.v64i8"] + fn reduce_min_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.umin.u8.v64u8"] + fn reduce_min_u8x64(x: u8x64) -> u8; + #[link_name = "llvm.experimental.vector.reduce.fmin.f32.v2f32"] + fn reduce_fmin_f32x2(x: f32x2) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmin.f64.v2f64"] + fn reduce_fmin_f64x2(x: f64x2) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmin.f32.v4f32"] + fn reduce_fmin_f32x4(x: f32x4) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmin.f64.v4f64"] + fn reduce_fmin_f64x4(x: f64x4) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmin.f32.v8f32"] + fn reduce_fmin_f32x8(x: f32x8) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmin.f64.v8f64"] + fn reduce_fmin_f64x8(x: f64x8) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmin.f32.v16f32"] + fn reduce_fmin_f32x16(x: f32x16) -> f32; +} + +/// Reduction: horizontal max of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceMin { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal max of the vector elements. + fn reduce_min(self) -> Self::Acc; +} + +macro_rules! 
red_min { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceMin for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_min(self) -> Self::Acc { + unsafe { $llvm_intr(self) } + } + // FIXME: broken on AArch64 + #[cfg(target_arch = "aarch64")] + #[allow(unused_imports)] + #[inline] + fn reduce_min(self) -> Self::Acc { + use num::Float; + use cmp::Ord; + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x = x.min(self.extract(i)); + } + x + } + } + }; +} +red_min!(i8x2, i8, reduce_min_i8x2); +red_min!(u8x2, u8, reduce_min_u8x2); +red_min!(i16x2, i16, reduce_min_i16x2); +red_min!(u16x2, u16, reduce_min_u16x2); +red_min!(i32x2, i32, reduce_min_i32x2); +red_min!(u32x2, u32, reduce_min_u32x2); +red_min!(i64x2, i64, reduce_min_i64x2); +red_min!(u64x2, u64, reduce_min_u64x2); +red_min!(i8x4, i8, reduce_min_i8x4); +red_min!(u8x4, u8, reduce_min_u8x4); +red_min!(i16x4, i16, reduce_min_i16x4); +red_min!(u16x4, u16, reduce_min_u16x4); +red_min!(i32x4, i32, reduce_min_i32x4); +red_min!(u32x4, u32, reduce_min_u32x4); +red_min!(i64x4, i64, reduce_min_i64x4); +red_min!(u64x4, u64, reduce_min_u64x4); +red_min!(i8x8, i8, reduce_min_i8x8); +red_min!(u8x8, u8, reduce_min_u8x8); +red_min!(i16x8, i16, reduce_min_i16x8); +red_min!(u16x8, u16, reduce_min_u16x8); +red_min!(i32x8, i32, reduce_min_i32x8); +red_min!(u32x8, u32, reduce_min_u32x8); +red_min!(i64x8, i64, reduce_min_i64x8); +red_min!(u64x8, u64, reduce_min_u64x8); +red_min!(i8x16, i8, reduce_min_i8x16); +red_min!(u8x16, u8, reduce_min_u8x16); +red_min!(i16x16, i16, reduce_min_i16x16); +red_min!(u16x16, u16, reduce_min_u16x16); +red_min!(i32x16, i32, reduce_min_i32x16); +red_min!(u32x16, u32, reduce_min_u32x16); +red_min!(i8x32, i8, reduce_min_i8x32); +red_min!(u8x32, u8, reduce_min_u8x32); +red_min!(i16x32, i16, reduce_min_i16x32); +red_min!(u16x32, u16, reduce_min_u16x32); +red_min!(i8x64, i8, reduce_min_i8x64); +red_min!(u8x64, u8, reduce_min_u8x64); + +red_min!(f32x2, f32, reduce_fmin_f32x2); +red_min!(f64x2, f64, reduce_fmin_f64x2); +red_min!(f32x4, f32, reduce_fmin_f32x4); +red_min!(f64x4, f64, reduce_fmin_f64x4); +red_min!(f32x8, f32, reduce_fmin_f32x8); +red_min!(f64x8, f64, reduce_fmin_f64x8); +red_min!(f32x16, f32, reduce_fmin_f32x16); + +#[cfg(test)] +mod tests { + use super::ReduceMin; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_min_i32x4() { + let v = i32x4::new(1, 2, -1, 3); + assert_eq!(v.reduce_min(), -1_i32); + } + #[test] + fn reduce_min_u32x4() { + let v = u32x4::new(4, 2, 7, 3); + assert_eq!(v.reduce_min(), 2_u32); + } + #[test] + fn reduce_min_f32x4() { + let v = f32x4::new(4., 2., -1., 3.); + assert_eq!(v.reduce_min(), -1.); + } +} diff --git a/coresimd/ppsv/codegen/mod.rs b/coresimd/ppsv/codegen/mod.rs new file mode 100644 index 0000000000..26beb14563 --- /dev/null +++ b/coresimd/ppsv/codegen/mod.rs @@ -0,0 +1,9 @@ +//! Code Generation + +pub mod sum; +pub mod product; +pub mod and; +pub mod or; +pub mod xor; +pub mod min; +pub mod max; diff --git a/coresimd/ppsv/codegen/or.rs b/coresimd/ppsv/codegen/or.rs new file mode 100644 index 0000000000..007d8f9e11 --- /dev/null +++ b/coresimd/ppsv/codegen/or.rs @@ -0,0 +1,170 @@ +//! Code generation for the or reduction. 
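Note (illustrative sketch, not part of this patch): `reduce_or` folds `|` over the lanes; the `b8xN` boolean impls at the bottom of this file reuse the `i8` intrinsics, presumably so that the portable boolean reductions lower to the same code. Assuming `ReduceOr` and the portable types are imported as in the tests below:

    let v = i32x4::new(0, 0, 2, 0);
    assert_eq!(ReduceOr::reduce_or(v), 2); // 0 | 0 | 2 | 0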
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the or reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.or.i8.v2i8"] + fn reduce_or_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v2u8"] + fn reduce_or_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v2i16"] + fn reduce_or_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.or.u16.v2u16"] + fn reduce_or_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i32.v2i32"] + fn reduce_or_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.or.u32.v2u32"] + fn reduce_or_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.or.i64.v2i64"] + fn reduce_or_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.or.u64.v2u64"] + fn reduce_or_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v4i8"] + fn reduce_or_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v4u8"] + fn reduce_or_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v4i16"] + fn reduce_or_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.or.u16.v4u16"] + fn reduce_or_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i32.v4i32"] + fn reduce_or_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.or.u32.v4u32"] + fn reduce_or_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.or.i64.v4i64"] + fn reduce_or_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.or.u64.v4u64"] + fn reduce_or_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v8i8"] + fn reduce_or_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v8u8"] + fn reduce_or_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v8i16"] + fn reduce_or_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.or.u16.v8u16"] + fn reduce_or_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i32.v8i32"] + fn reduce_or_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.or.u32.v8u32"] + fn reduce_or_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.or.i64.v8i64"] + fn reduce_or_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.or.u64.v8u64"] + fn reduce_or_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v16i8"] + fn reduce_or_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v16u8"] + fn reduce_or_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v16i16"] + fn reduce_or_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.or.u16.v16u16"] + fn reduce_or_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i32.v16i32"] + fn reduce_or_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.or.u32.v16u32"] + fn reduce_or_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v32i8"] + fn reduce_or_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v32u8"] + fn reduce_or_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.or.i16.v32i16"] + fn reduce_or_i16x32(x: i16x32) -> i16; + #[link_name = 
"llvm.experimental.vector.reduce.or.u16.v32u16"] + fn reduce_or_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.or.i8.v64i8"] + fn reduce_or_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.or.u8.v64u8"] + fn reduce_or_u8x64(x: u8x64) -> u8; +} + +/// Reduction: horizontal bitwise or of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceOr { + /// Result of the reduction. + type Acc; + /// Computes the horizontal bitwise or of the vector elements. + fn reduce_or(self) -> Self::Acc; +} + +macro_rules! red_or { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceOr for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_or(self) -> Self::Acc { + unsafe { $llvm_intr(self.into_bits()) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_or(self) -> Self::Acc { + let mut x = self.extract(0) as Self::Acc; + for i in 1..$id::lanes() { + x |= self.extract(i) as Self::Acc; + } + x + } + } + }; +} +red_or!(i8x2, i8, reduce_or_i8x2); +red_or!(u8x2, u8, reduce_or_u8x2); +red_or!(i16x2, i16, reduce_or_i16x2); +red_or!(u16x2, u16, reduce_or_u16x2); +red_or!(i32x2, i32, reduce_or_i32x2); +red_or!(u32x2, u32, reduce_or_u32x2); +red_or!(i64x2, i64, reduce_or_i64x2); +red_or!(u64x2, u64, reduce_or_u64x2); +red_or!(i8x4, i8, reduce_or_i8x4); +red_or!(u8x4, u8, reduce_or_u8x4); +red_or!(i16x4, i16, reduce_or_i16x4); +red_or!(u16x4, u16, reduce_or_u16x4); +red_or!(i32x4, i32, reduce_or_i32x4); +red_or!(u32x4, u32, reduce_or_u32x4); +red_or!(i64x4, i64, reduce_or_i64x4); +red_or!(u64x4, u64, reduce_or_u64x4); +red_or!(i8x8, i8, reduce_or_i8x8); +red_or!(u8x8, u8, reduce_or_u8x8); +red_or!(i16x8, i16, reduce_or_i16x8); +red_or!(u16x8, u16, reduce_or_u16x8); +red_or!(i32x8, i32, reduce_or_i32x8); +red_or!(u32x8, u32, reduce_or_u32x8); +red_or!(i64x8, i64, reduce_or_i64x8); +red_or!(u64x8, u64, reduce_or_u64x8); +red_or!(i8x16, i8, reduce_or_i8x16); +red_or!(u8x16, u8, reduce_or_u8x16); +red_or!(i16x16, i16, reduce_or_i16x16); +red_or!(u16x16, u16, reduce_or_u16x16); +red_or!(i32x16, i32, reduce_or_i32x16); +red_or!(u32x16, u32, reduce_or_u32x16); +red_or!(i8x32, i8, reduce_or_i8x32); +red_or!(u8x32, u8, reduce_or_u8x32); +red_or!(i16x32, i16, reduce_or_i16x32); +red_or!(u16x32, u16, reduce_or_u16x32); +red_or!(i8x64, i8, reduce_or_i8x64); +red_or!(u8x64, u8, reduce_or_u8x64); + +red_or!(b8x2, i8, reduce_or_i8x2); +red_or!(b8x4, i8, reduce_or_i8x4); +red_or!(b8x8, i8, reduce_or_i8x8); +red_or!(b8x16, i8, reduce_or_i8x16); +red_or!(b8x32, i8, reduce_or_i8x32); +red_or!(b8x64, i8, reduce_or_i8x64); + +#[cfg(test)] +mod tests { + use super::ReduceOr; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_or_i32x4() { + let v = i32x4::splat(1); + assert_eq!(v.reduce_or(), 1_i32); + let v = i32x4::new(1, 1, 0, 1); + assert_eq!(v.reduce_or(), 1_i32); + } +} diff --git a/coresimd/ppsv/codegen/product.rs b/coresimd/ppsv/codegen/product.rs new file mode 100644 index 0000000000..725ee884c5 --- /dev/null +++ b/coresimd/ppsv/codegen/product.rs @@ -0,0 +1,210 @@ +//! Code generation for the product reduction. 
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the product reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v2i8"] + fn reduce_mul_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v2u8"] + fn reduce_mul_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v2i16"] + fn reduce_mul_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v2u16"] + fn reduce_mul_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i32.v2i32"] + fn reduce_mul_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.mul.u32.v2u32"] + fn reduce_mul_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.mul.i64.v2i64"] + fn reduce_mul_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.mul.u64.v2u64"] + fn reduce_mul_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v4i8"] + fn reduce_mul_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v4u8"] + fn reduce_mul_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v4i16"] + fn reduce_mul_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v4u16"] + fn reduce_mul_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i32.v4i32"] + fn reduce_mul_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.mul.u32.v4u32"] + fn reduce_mul_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.mul.i64.v4i64"] + fn reduce_mul_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.mul.u64.v4u64"] + fn reduce_mul_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v8i8"] + fn reduce_mul_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v8u8"] + fn reduce_mul_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v8i16"] + fn reduce_mul_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v8u16"] + fn reduce_mul_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i32.v8i32"] + fn reduce_mul_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.mul.u32.v8u32"] + fn reduce_mul_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.mul.i64.v8i64"] + fn reduce_mul_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.mul.u64.v8u64"] + fn reduce_mul_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v16i8"] + fn reduce_mul_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v16u8"] + fn reduce_mul_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v16i16"] + fn reduce_mul_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v16u16"] + fn reduce_mul_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i32.v16i32"] + fn reduce_mul_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.mul.u32.v16u32"] + fn reduce_mul_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v32i8"] + fn reduce_mul_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v32u8"] + fn reduce_mul_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.mul.i16.v32i16"] + fn reduce_mul_i16x32(x: i16x32) 
-> i16; + #[link_name = "llvm.experimental.vector.reduce.mul.u16.v32u16"] + fn reduce_mul_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.mul.i8.v64i8"] + fn reduce_mul_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.mul.u8.v64u8"] + fn reduce_mul_u8x64(x: u8x64) -> u8; + #[link_name = "llvm.experimental.vector.reduce.fmul.f32.v2f32"] + fn reduce_fmul_f32x2(acc: f32, x: f32x2) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmul.f64.v2f64"] + fn reduce_fmul_f64x2(acc: f64, x: f64x2) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmul.f32.v4f32"] + fn reduce_fmul_f32x4(acc: f32, x: f32x4) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmul.f64.v4f64"] + fn reduce_fmul_f64x4(acc: f64, x: f64x4) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmul.f32.v8f32"] + fn reduce_fmul_f32x8(acc: f32, x: f32x8) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fmul.f64.v8f64"] + fn reduce_fmul_f64x8(acc: f64, x: f64x8) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fmul.f32.v16f32"] + fn reduce_fmul_f32x16(acc: f32, x: f32x16) -> f32; +} + +/// Reduction: horizontal product of the vector elements. +pub trait ReduceMul { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal product of the vector elements. + fn reduce_mul(self) -> Self::Acc; +} + +macro_rules! red_mul { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceMul for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_mul(self) -> Self::Acc { + unsafe { $llvm_intr(self) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_mul(self) -> Self::Acc { + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x *= self.extract(i); + } + x + } + } + }; +} +red_mul!(i8x2, i8, reduce_mul_i8x2); +red_mul!(u8x2, u8, reduce_mul_u8x2); +red_mul!(i16x2, i16, reduce_mul_i16x2); +red_mul!(u16x2, u16, reduce_mul_u16x2); +red_mul!(i32x2, i32, reduce_mul_i32x2); +red_mul!(u32x2, u32, reduce_mul_u32x2); +red_mul!(i64x2, i64, reduce_mul_i64x2); +red_mul!(u64x2, u64, reduce_mul_u64x2); +red_mul!(i8x4, i8, reduce_mul_i8x4); +red_mul!(u8x4, u8, reduce_mul_u8x4); +red_mul!(i16x4, i16, reduce_mul_i16x4); +red_mul!(u16x4, u16, reduce_mul_u16x4); +red_mul!(i32x4, i32, reduce_mul_i32x4); +red_mul!(u32x4, u32, reduce_mul_u32x4); +red_mul!(i64x4, i64, reduce_mul_i64x4); +red_mul!(u64x4, u64, reduce_mul_u64x4); +red_mul!(i8x8, i8, reduce_mul_i8x8); +red_mul!(u8x8, u8, reduce_mul_u8x8); +red_mul!(i16x8, i16, reduce_mul_i16x8); +red_mul!(u16x8, u16, reduce_mul_u16x8); +red_mul!(i32x8, i32, reduce_mul_i32x8); +red_mul!(u32x8, u32, reduce_mul_u32x8); +red_mul!(i64x8, i64, reduce_mul_i64x8); +red_mul!(u64x8, u64, reduce_mul_u64x8); +red_mul!(i8x16, i8, reduce_mul_i8x16); +red_mul!(u8x16, u8, reduce_mul_u8x16); +red_mul!(i16x16, i16, reduce_mul_i16x16); +red_mul!(u16x16, u16, reduce_mul_u16x16); +red_mul!(i32x16, i32, reduce_mul_i32x16); +red_mul!(u32x16, u32, reduce_mul_u32x16); +red_mul!(i8x32, i8, reduce_mul_i8x32); +red_mul!(u8x32, u8, reduce_mul_u8x32); +red_mul!(i16x32, i16, reduce_mul_i16x32); +red_mul!(u16x32, u16, reduce_mul_u16x32); +red_mul!(i8x64, i8, reduce_mul_i8x64); +red_mul!(u8x64, u8, reduce_mul_u8x64); + +macro_rules! red_fmul { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceMul for $id { + type Acc = $elem_ty; + #[inline] + fn reduce_mul(self) -> Self::Acc { + // FIXME: + // unsafe { $llvm_intr(1. 
as $elem_ty, self) } + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x *= self.extract(i); + } + x + } + } + }; +} + +red_fmul!(f32x2, f32, reduce_fmul_f32x2); +red_fmul!(f64x2, f64, reduce_fmul_f64x2); +red_fmul!(f32x4, f32, reduce_fmul_f32x4); +red_fmul!(f64x4, f64, reduce_fmul_f64x4); +red_fmul!(f32x8, f32, reduce_fmul_f32x8); +red_fmul!(f64x8, f64, reduce_fmul_f64x8); +red_fmul!(f32x16, f32, reduce_fmul_f32x16); + +#[cfg(test)] +mod tests { + use super::ReduceMul; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_mul_i32x4() { + let v = i32x4::splat(2); + assert_eq!(v.reduce_mul(), 16_i32); + } + #[test] + fn reduce_mul_u32x4() { + let v = u32x4::splat(2); + assert_eq!(v.reduce_mul(), 16_u32); + } + #[test] + fn reduce_mul_f32x4() { + let v = f32x4::splat(2.); + assert_eq!(v.reduce_mul(), 16.); + } +} diff --git a/coresimd/ppsv/codegen/sum.rs b/coresimd/ppsv/codegen/sum.rs new file mode 100644 index 0000000000..79b1ba0849 --- /dev/null +++ b/coresimd/ppsv/codegen/sum.rs @@ -0,0 +1,210 @@ +//! Code generation for the sum reduction. +use ::coresimd::simd::*; + +/// LLVM intrinsics used in the sum reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.add.i8.v2i8"] + fn reduce_add_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v2u8"] + fn reduce_add_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v2i16"] + fn reduce_add_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v2u16"] + fn reduce_add_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i32.v2i32"] + fn reduce_add_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.add.u32.v2u32"] + fn reduce_add_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.add.i64.v2i64"] + fn reduce_add_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.add.u64.v2u64"] + fn reduce_add_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v4i8"] + fn reduce_add_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v4u8"] + fn reduce_add_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v4i16"] + fn reduce_add_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v4u16"] + fn reduce_add_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i32.v4i32"] + fn reduce_add_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.add.u32.v4u32"] + fn reduce_add_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.add.i64.v4i64"] + fn reduce_add_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.add.u64.v4u64"] + fn reduce_add_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v8i8"] + fn reduce_add_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v8u8"] + fn reduce_add_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v8i16"] + fn reduce_add_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v8u16"] + fn reduce_add_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i32.v8i32"] + fn reduce_add_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.add.u32.v8u32"] + fn reduce_add_u32x8(x: u32x8) -> u32; + #[link_name = 
"llvm.experimental.vector.reduce.add.i64.v8i64"] + fn reduce_add_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.add.u64.v8u64"] + fn reduce_add_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v16i8"] + fn reduce_add_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v16u8"] + fn reduce_add_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v16i16"] + fn reduce_add_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v16u16"] + fn reduce_add_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i32.v16i32"] + fn reduce_add_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.add.u32.v16u32"] + fn reduce_add_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v32i8"] + fn reduce_add_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v32u8"] + fn reduce_add_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.add.i16.v32i16"] + fn reduce_add_i16x32(x: i16x32) -> i16; + #[link_name = "llvm.experimental.vector.reduce.add.u16.v32u16"] + fn reduce_add_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.add.i8.v64i8"] + fn reduce_add_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.add.u8.v64u8"] + fn reduce_add_u8x64(x: u8x64) -> u8; + #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v2f32"] + fn reduce_fadd_f32x2(acc: f32, x: f32x2) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v2f64"] + fn reduce_fadd_f64x2(acc: f64, x: f64x2) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v4f32"] + fn reduce_fadd_f32x4(acc: f32, x: f32x4) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v4f64"] + fn reduce_fadd_f64x4(acc: f64, x: f64x4) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v8f32"] + fn reduce_fadd_f32x8(acc: f32, x: f32x8) -> f32; + #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v8f64"] + fn reduce_fadd_f64x8(acc: f64, x: f64x8) -> f64; + #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v16f32"] + fn reduce_fadd_f32x16(acc: f32, x: f32x16) -> f32; +} + +/// Reduction: horizontal sum of the vector elements. +pub trait ReduceAdd { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal sum of the vector elements. + fn reduce_add(self) -> Self::Acc; +} + +macro_rules! 
red_add { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceAdd for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_add(self) -> Self::Acc { + unsafe { $llvm_intr(self) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_add(self) -> Self::Acc { + let mut x = self.extract(0) as Self::Acc; + for i in 1..$id::lanes() { + x += self.extract(i) as Self::Acc; + } + x + } + } + }; +} +red_add!(i8x2, i8, reduce_add_i8x2); +red_add!(u8x2, u8, reduce_add_u8x2); +red_add!(i16x2, i16, reduce_add_i16x2); +red_add!(u16x2, u16, reduce_add_u16x2); +red_add!(i32x2, i32, reduce_add_i32x2); +red_add!(u32x2, u32, reduce_add_u32x2); +red_add!(i64x2, i64, reduce_add_i64x2); +red_add!(u64x2, u64, reduce_add_u64x2); +red_add!(i8x4, i8, reduce_add_i8x4); +red_add!(u8x4, u8, reduce_add_u8x4); +red_add!(i16x4, i16, reduce_add_i16x4); +red_add!(u16x4, u16, reduce_add_u16x4); +red_add!(i32x4, i32, reduce_add_i32x4); +red_add!(u32x4, u32, reduce_add_u32x4); +red_add!(i64x4, i64, reduce_add_i64x4); +red_add!(u64x4, u64, reduce_add_u64x4); +red_add!(i8x8, i8, reduce_add_i8x8); +red_add!(u8x8, u8, reduce_add_u8x8); +red_add!(i16x8, i16, reduce_add_i16x8); +red_add!(u16x8, u16, reduce_add_u16x8); +red_add!(i32x8, i32, reduce_add_i32x8); +red_add!(u32x8, u32, reduce_add_u32x8); +red_add!(i64x8, i64, reduce_add_i64x8); +red_add!(u64x8, u64, reduce_add_u64x8); +red_add!(i8x16, i8, reduce_add_i8x16); +red_add!(u8x16, u8, reduce_add_u8x16); +red_add!(i16x16, i16, reduce_add_i16x16); +red_add!(u16x16, u16, reduce_add_u16x16); +red_add!(i32x16, i32, reduce_add_i32x16); +red_add!(u32x16, u32, reduce_add_u32x16); +red_add!(i8x32, i8, reduce_add_i8x32); +red_add!(u8x32, u8, reduce_add_u8x32); +red_add!(i16x32, i16, reduce_add_i16x32); +red_add!(u16x32, u16, reduce_add_u16x32); +red_add!(i8x64, i8, reduce_add_i8x64); +red_add!(u8x64, u8, reduce_add_u8x64); + +macro_rules! red_fadd { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceAdd for $id { + type Acc = $elem_ty; + #[inline] + fn reduce_add(self) -> Self::Acc { + // FIXME: + //unsafe { $llvm_intr(0. as $elem_ty, self) } + let mut x = self.extract(0); + for i in 1..$id::lanes() { + x += self.extract(i); + } + x + } + } + }; +} + +red_fadd!(f32x2, f32, reduce_fadd_f32x2); +red_fadd!(f64x2, f64, reduce_fadd_f64x2); +red_fadd!(f32x4, f32, reduce_fadd_f32x4); +red_fadd!(f64x4, f64, reduce_fadd_f64x4); +red_fadd!(f32x8, f32, reduce_fadd_f32x8); +red_fadd!(f64x8, f64, reduce_fadd_f64x8); +red_fadd!(f32x16, f32, reduce_fadd_f32x16); + +#[cfg(test)] +mod tests { + use super::ReduceAdd; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_add_i32x4() { + let v = i32x4::splat(1); + assert_eq!(v.reduce_add(), 4_i32); + } + #[test] + fn reduce_add_u32x4() { + let v = u32x4::splat(1); + assert_eq!(v.reduce_add(), 4_u32); + } + #[test] + fn reduce_add_f32x4() { + let v = f32x4::splat(1.); + assert_eq!(v.reduce_add(), 4.); + } +} diff --git a/coresimd/ppsv/codegen/xor.rs b/coresimd/ppsv/codegen/xor.rs new file mode 100644 index 0000000000..cf5ef2bece --- /dev/null +++ b/coresimd/ppsv/codegen/xor.rs @@ -0,0 +1,170 @@ +//! Code generation for the xor reduction. 
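Note (illustrative sketch, not part of this patch): `reduce_xor` folds `^` over the lanes, so lane values that occur an even number of times cancel out. Assuming `ReduceXor` and the portable types are imported as in the tests below:

    let v = u32x4::new(0b0011, 0b0101, 0b0110, 0b0000);
    assert_eq!(ReduceXor::reduce_xor(v), 0); // 0b0011 ^ 0b0101 ^ 0b0110 ^ 0b0000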
+use ::coresimd::simd::*; + +/// LLVM intrinsics used in the xor reduction +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v2i8"] + fn reduce_xor_i8x2(x: i8x2) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v2u8"] + fn reduce_xor_u8x2(x: u8x2) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v2i16"] + fn reduce_xor_i16x2(x: i16x2) -> i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v2u16"] + fn reduce_xor_u16x2(x: u16x2) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i32.v2i32"] + fn reduce_xor_i32x2(x: i32x2) -> i32; + #[link_name = "llvm.experimental.vector.reduce.xor.u32.v2u32"] + fn reduce_xor_u32x2(x: u32x2) -> u32; + #[link_name = "llvm.experimental.vector.reduce.xor.i64.v2i64"] + fn reduce_xor_i64x2(x: i64x2) -> i64; + #[link_name = "llvm.experimental.vector.reduce.xor.u64.v2u64"] + fn reduce_xor_u64x2(x: u64x2) -> u64; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v4i8"] + fn reduce_xor_i8x4(x: i8x4) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v4u8"] + fn reduce_xor_u8x4(x: u8x4) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v4i16"] + fn reduce_xor_i16x4(x: i16x4) -> i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v4u16"] + fn reduce_xor_u16x4(x: u16x4) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i32.v4i32"] + fn reduce_xor_i32x4(x: i32x4) -> i32; + #[link_name = "llvm.experimental.vector.reduce.xor.u32.v4u32"] + fn reduce_xor_u32x4(x: u32x4) -> u32; + #[link_name = "llvm.experimental.vector.reduce.xor.i64.v4i64"] + fn reduce_xor_i64x4(x: i64x4) -> i64; + #[link_name = "llvm.experimental.vector.reduce.xor.u64.v4u64"] + fn reduce_xor_u64x4(x: u64x4) -> u64; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v8i8"] + fn reduce_xor_i8x8(x: i8x8) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v8u8"] + fn reduce_xor_u8x8(x: u8x8) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v8i16"] + fn reduce_xor_i16x8(x: i16x8) -> i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v8u16"] + fn reduce_xor_u16x8(x: u16x8) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i32.v8i32"] + fn reduce_xor_i32x8(x: i32x8) -> i32; + #[link_name = "llvm.experimental.vector.reduce.xor.u32.v8u32"] + fn reduce_xor_u32x8(x: u32x8) -> u32; + #[link_name = "llvm.experimental.vector.reduce.xor.i64.v8i64"] + fn reduce_xor_i64x8(x: i64x8) -> i64; + #[link_name = "llvm.experimental.vector.reduce.xor.u64.v8u64"] + fn reduce_xor_u64x8(x: u64x8) -> u64; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v16i8"] + fn reduce_xor_i8x16(x: i8x16) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v16u8"] + fn reduce_xor_u8x16(x: u8x16) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v16i16"] + fn reduce_xor_i16x16(x: i16x16) -> i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v16u16"] + fn reduce_xor_u16x16(x: u16x16) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i32.v16i32"] + fn reduce_xor_i32x16(x: i32x16) -> i32; + #[link_name = "llvm.experimental.vector.reduce.xor.u32.v16u32"] + fn reduce_xor_u32x16(x: u32x16) -> u32; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v32i8"] + fn reduce_xor_i8x32(x: i8x32) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v32u8"] + fn reduce_xor_u8x32(x: u8x32) -> u8; + #[link_name = "llvm.experimental.vector.reduce.xor.i16.v32i16"] + fn reduce_xor_i16x32(x: i16x32) -> 
i16; + #[link_name = "llvm.experimental.vector.reduce.xor.u16.v32u16"] + fn reduce_xor_u16x32(x: u16x32) -> u16; + #[link_name = "llvm.experimental.vector.reduce.xor.i8.v64i8"] + fn reduce_xor_i8x64(x: i8x64) -> i8; + #[link_name = "llvm.experimental.vector.reduce.xor.u8.v64u8"] + fn reduce_xor_u8x64(x: u8x64) -> u8; +} + +/// Reduction: horizontal bitwise xor of the vector elements. +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] +pub trait ReduceXor { + /// Result type of the reduction. + type Acc; + /// Computes the horizontal bitwise xor of the vector elements. + fn reduce_xor(self) -> Self::Acc; +} + +macro_rules! red_xor { + ($id:ident, $elem_ty:ident, $llvm_intr:ident) => { + impl ReduceXor for $id { + type Acc = $elem_ty; + #[cfg(not(target_arch = "aarch64"))] + #[inline] + fn reduce_xor(self) -> Self::Acc { + unsafe { $llvm_intr(self.into_bits()) } + } + // FIXME: broken in AArch64 + #[cfg(target_arch = "aarch64")] + #[inline] + fn reduce_xor(self) -> Self::Acc { + let mut x = self.extract(0) as Self::Acc; + for i in 1..$id::lanes() { + x ^= self.extract(i) as Self::Acc; + } + x + } + } + }; +} +red_xor!(i8x2, i8, reduce_xor_i8x2); +red_xor!(u8x2, u8, reduce_xor_u8x2); +red_xor!(i16x2, i16, reduce_xor_i16x2); +red_xor!(u16x2, u16, reduce_xor_u16x2); +red_xor!(i32x2, i32, reduce_xor_i32x2); +red_xor!(u32x2, u32, reduce_xor_u32x2); +red_xor!(i64x2, i64, reduce_xor_i64x2); +red_xor!(u64x2, u64, reduce_xor_u64x2); +red_xor!(i8x4, i8, reduce_xor_i8x4); +red_xor!(u8x4, u8, reduce_xor_u8x4); +red_xor!(i16x4, i16, reduce_xor_i16x4); +red_xor!(u16x4, u16, reduce_xor_u16x4); +red_xor!(i32x4, i32, reduce_xor_i32x4); +red_xor!(u32x4, u32, reduce_xor_u32x4); +red_xor!(i64x4, i64, reduce_xor_i64x4); +red_xor!(u64x4, u64, reduce_xor_u64x4); +red_xor!(i8x8, i8, reduce_xor_i8x8); +red_xor!(u8x8, u8, reduce_xor_u8x8); +red_xor!(i16x8, i16, reduce_xor_i16x8); +red_xor!(u16x8, u16, reduce_xor_u16x8); +red_xor!(i32x8, i32, reduce_xor_i32x8); +red_xor!(u32x8, u32, reduce_xor_u32x8); +red_xor!(i64x8, i64, reduce_xor_i64x8); +red_xor!(u64x8, u64, reduce_xor_u64x8); +red_xor!(i8x16, i8, reduce_xor_i8x16); +red_xor!(u8x16, u8, reduce_xor_u8x16); +red_xor!(i16x16, i16, reduce_xor_i16x16); +red_xor!(u16x16, u16, reduce_xor_u16x16); +red_xor!(i32x16, i32, reduce_xor_i32x16); +red_xor!(u32x16, u32, reduce_xor_u32x16); +red_xor!(i8x32, i8, reduce_xor_i8x32); +red_xor!(u8x32, u8, reduce_xor_u8x32); +red_xor!(i16x32, i16, reduce_xor_i16x32); +red_xor!(u16x32, u16, reduce_xor_u16x32); +red_xor!(i8x64, i8, reduce_xor_i8x64); +red_xor!(u8x64, u8, reduce_xor_u8x64); + +red_xor!(b8x2, i8, reduce_xor_i8x2); +red_xor!(b8x4, i8, reduce_xor_i8x4); +red_xor!(b8x8, i8, reduce_xor_i8x8); +red_xor!(b8x16, i8, reduce_xor_i8x16); +red_xor!(b8x32, i8, reduce_xor_i8x32); +red_xor!(b8x64, i8, reduce_xor_i8x64); + +#[cfg(test)] +mod tests { + use super::ReduceXor; + use ::coresimd::simd::*; + + // note: these are tested in the portable vector API tests + + #[test] + fn reduce_xor_i32x4() { + let v = i32x4::splat(1); + assert_eq!(v.reduce_xor(), 0_i32); + let v = i32x4::new(1, 0, 0, 0); + assert_eq!(v.reduce_xor(), 1_i32); + } +} diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs new file mode 100644 index 0000000000..69de61d906 --- /dev/null +++ b/coresimd/ppsv/mod.rs @@ -0,0 +1,82 @@ +//! Portable Packed-SIMD Vectors. +//! +//! These types are: +//! +//! * portable: work correctly on all architectures, +//! * packed: have a size fixed at compile-time. +//! +//! These two terms are the opposites of: +//! +//! 
* architecture-specific: only available in a particular architecture, +//! * scalable: the vector's size is dynamic. +//! +//! This module is structured as follows: +//! +//! * `api`: defines the API of the portable packed vector types. +//! * `v{width}`: defines the portable vector types for a particular `width`. +//! +//! The portable packed vector types are named using the following schema: +//! `{t}{l_w}x{l_n}`: +//! +//! * `t`: type - single letter corresponding to the following Rust literal +//! types: * `i`: signed integer +//! * `u`: unsigned integer +//! * `f`: floating point +//! * `b`: boolean +//! * `l_w`: lane width in bits +//! * `l_n`: number of lanes +//! +//! For example, `f32x4` is a vector type containing four 32-bit wide +//! floating-point numbers. The total width of this type is 32 bit times 4 +//! lanes, that is, 128 bits, and is thus defined in the `v128` module. + +#[macro_use] +mod api; +mod codegen; + +mod v16; +mod v32; +mod v64; +mod v128; +mod v256; +mod v512; + +pub use self::v16::*; +pub use self::v32::*; +pub use self::v64::*; +pub use self::v128::*; +pub use self::v256::*; +pub use self::v512::*; + +use marker; + +/// Safe lossless bitwise conversion from `T` to `Self`. +pub trait FromBits<T>: marker::Sized { + /// Safe lossless bitwise transmute from `T` to `Self`. + fn from_bits(T) -> Self; +} + +/// Safe lossless bitwise conversion from `Self` to `T`. +pub trait IntoBits<T>: marker::Sized { + /// Safe lossless bitwise transmute from `self` to `T`. + fn into_bits(self) -> T; +} + +// FromBits implies IntoBits +impl<T, U> IntoBits<U> for T +where + U: FromBits<T>, +{ + #[inline] + fn into_bits(self) -> U { + U::from_bits(self) + } +} + +// FromBits (and thus IntoBits) is reflexive +impl<T> FromBits<T> for T { + #[inline] + fn from_bits(t: Self) -> Self { + t + } +} diff --git a/coresimd/ppsv/v128.rs b/coresimd/ppsv/v128.rs new file mode 100644 index 0000000000..8a1293a16b --- /dev/null +++ b/coresimd/ppsv/v128.rs @@ -0,0 +1,338 @@ +//! 128-bit wide portable packed vector types. + +simd_api_imports!(); + +use ::coresimd::simd::{b8x2, b8x4, b8x8}; + +simd_i_ty! { + i8x16: 16, i8, b8x16, i8x16_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 128-bit vector with 16 `i8` lanes. +} + +simd_u_ty! { + u8x16: 16, u8, b8x16, u8x16_tests | + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 | + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 128-bit vector with 16 `u8` lanes. +} + +simd_b_ty! { + b8x16: 16, i8, b8x16_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 128-bit vector with 16 `bool` lanes. +} + +simd_i_ty! { + i16x8: 8, i16, b8x8, i16x8_tests | + i16, i16, i16, i16, i16, i16, i16, i16 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 128-bit vector with 8 `i16` lanes. +} + +simd_u_ty! { + u16x8: 8, u16, b8x8, u16x8_tests | + u16, u16, u16, u16, u16, u16, u16, u16 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 128-bit vector with 8 `u16` lanes. +} + +simd_i_ty! { + i32x4: 4, i32, b8x4, i32x4_tests | + i32, i32, i32, i32 | + x0, x1, x2, x3 | + /// A 128-bit vector with 4 `i32` lanes. +} + +simd_u_ty! { + u32x4: 4, u32, b8x4, u32x4_tests | + u32, u32, u32, u32 | + x0, x1, x2, x3 | + /// A 128-bit vector with 4 `u32` lanes. +} + +simd_f_ty!
{ + f32x4: 4, f32, b8x4, f32x4_tests | + f32, f32, f32, f32 | + x0, x1, x2, x3 | + /// A 128-bit vector with 4 `f32` lanes. +} + +simd_i_ty! { + i64x2: 2, i64, b8x2, i64x2_tests | + i64, i64 | + x0, x1 | + /// A 128-bit vector with 2 `i64` lanes. +} + +simd_u_ty! { + u64x2: 2, u64, b8x2, u64x2_tests | + u64, u64 | + x0, x1 | + /// A 128-bit vector with 2 `u64` lanes. +} + +simd_f_ty! { + f64x2: 2, f64, b8x2, f64x2_tests | + f64, f64 | + x0, x1 | + /// A 128-bit vector with 2 `f64` lanes. +} + +impl_from_bits!( + u64x2: u64, + u64x2_from_bits | i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + i64x2: i64, + i64x2_from_bits | u64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + f64x2: f64, + f64x2_from_bits | i64x2, + u64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + u32x4: u32, + u32x4_from_bits | u64x2, + i64x2, + f64x2, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + i32x4: i32, + i32x4_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + f32x4: f32, + f32x4_from_bits | u64x2, + i64x2, + f64x2, + i32x4, + u32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + u16x8: u16, + u16x8_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + i16x8: i16, + i16x8_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits!( + u8x16: u8, + u8x16_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + i8x16, + b8x16 +); +impl_from_bits!( + i8x16: i8, + i8x16_from_bits | u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + b8x16 +); + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m128; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m128i; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m128d; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f64x2: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u64x2: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i64x2: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f32x4: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u32x4: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i32x4: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u16x8: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i16x8: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u8x16: __m128, __m128i, __m128d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i8x16: __m128, __m128i, __m128d); + +impl_from!( + f64x2: f64, + f64x2_from | f32x2, + u64x2, + i64x2, + u32x2, + i32x2, + u16x2, + i16x2, + u8x2, + i8x2 +); +impl_from!( + f32x4: f32, + f32x4_from | f64x4, + u64x4, + i64x4, + u32x4, + i32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + 
u64x2: u64, + u64x2_from | f32x2, + f64x2, + i64x2, + i32x2, + u32x2, + i16x2, + u16x2, + i8x2, + u8x2 +); +impl_from!( + i64x2: i64, + i64x2_from | f32x2, + f64x2, + u64x2, + i32x2, + u32x2, + i16x2, + u16x2, + i8x2, + u8x2 +); +impl_from!( + u32x4: u32, + u32x4_from | f64x4, + u64x4, + i64x4, + f32x4, + i32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + i32x4: i32, + i32x4_from | f64x4, + u64x4, + i64x4, + f32x4, + u32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + i16x8: i16, + i16x8_from | f64x8, + u64x8, + i64x8, + f32x8, + u32x8, + i32x8, + u16x8, + u8x8, + i8x8 +); +impl_from!( + u16x8: u16, + u16x8_from | f64x8, + u64x8, + i64x8, + f32x8, + u32x8, + i32x8, + i16x8, + u8x8, + i8x8 +); diff --git a/coresimd/ppsv/v16.rs b/coresimd/ppsv/v16.rs new file mode 100644 index 0000000000..5bde9079f9 --- /dev/null +++ b/coresimd/ppsv/v16.rs @@ -0,0 +1,50 @@ +//! 16-bit wide portable packed vector types. + +simd_api_imports!(); + +simd_i_ty! { + i8x2: 2, i8, b8x2, i8x2_tests | + i8, i8 | + x0, x1 | + /// A 16-bit wide vector with 2 `i8` lanes. +} + +simd_u_ty! { + u8x2: 2, u8, b8x2, u8x2_tests | + u8, u8 | + x0, x1 | + /// A 16-bit wide vector with 2 `u8` lanes. +} + +simd_b_ty! { + b8x2: 2, i8, b8x2_tests | + i8, i8 | + x0, x1 | + /// A 16-bit wide vector with 2 `bool` lanes. +} + +impl_from_bits!(i8x2: i8, i8x2_from_bits | u8x2, b8x2); +impl_from_bits!(u8x2: u8, u8x2_from_bits | i8x2, b8x2); + +impl_from!( + i8x2: i8, + i8x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + i32x2, + u16x2, + u8x2 +); +impl_from!( + u8x2: u8, + u8x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + i32x2, + u16x2, + i8x2 +); diff --git a/coresimd/ppsv/v256.rs b/coresimd/ppsv/v256.rs new file mode 100644 index 0000000000..da7cdb92bd --- /dev/null +++ b/coresimd/ppsv/v256.rs @@ -0,0 +1,350 @@ +//! 256-bit wide portable packed vector types. + +simd_api_imports!(); + +use ::coresimd::simd::{b8x16, b8x8, b8x4}; + +simd_i_ty! { + i8x32: 32, i8, b8x32, i8x32_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 256-bit vector with 32 `i8` lanes. +} + +simd_u_ty! { + u8x32: 32, u8, b8x32, u8x32_tests | + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 256-bit vector with 32 `u8` lanes. +} + +simd_b_ty! { + b8x32: 32, i8, b8x32_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 256-bit vector with 32 `bool` lanes. +} + +simd_i_ty! { + i16x16: 16, i16, b8x16, i16x16_tests | + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 256-bit vector with 16 `i16` lanes. +} + +simd_u_ty! 
{ + u16x16: 16, u16, b8x16, u16x16_tests | + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 256-bit vector with 16 `u16` lanes. +} + +simd_i_ty! { + i32x8: 8, i32, b8x8, i32x8_tests | + i32, i32, i32, i32, i32, i32, i32, i32 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 256-bit vector with 8 `i32` lanes. +} + +simd_u_ty! { + u32x8: 8, u32, b8x8, u32x8_tests | + u32, u32, u32, u32, u32, u32, u32, u32 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 256-bit vector with 8 `u32` lanes. +} + +simd_f_ty! { + f32x8: 8, f32, b8x8, f32x8_tests | + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 256-bit vector with 8 `f32` lanes. +} + +simd_i_ty! { + i64x4: 4, i64, b8x4, i64x4_tests | + i64, i64, i64, i64 | + x0, x1, x2, x3 | + /// A 256-bit vector with 4 `i64` lanes. +} + +simd_u_ty! { + u64x4: 4, u64, b8x4, u64x4_tests | + u64, u64, u64, u64 | + x0, x1, x2, x3 | + /// A 256-bit vector with 4 `u64` lanes. +} + +simd_f_ty! { + f64x4: 4, f64, b8x4, f64x4_tests | + f64, f64, f64, f64 | + x0, x1, x2, x3 | + /// A 256-bit vector with 4 `f64` lanes. +} + +impl_from_bits!( + i8x32: i8, + i8x32_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + b8x32 +); +impl_from_bits!( + u8x32: u8, + u8x32_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + i8x32, + b8x32 +); +impl_from_bits!( + i16x16: i16, + i16x16_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + u16x16: u16, + u16x16_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + i32x8: i32, + i32x8_from_bits | u64x4, + i64x4, + f64x4, + u32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + u32x8: u32, + u32x8_from_bits | u64x4, + i64x4, + f64x4, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + f32x8: f32, + f32x8_from_bits | u64x4, + i64x4, + f64x4, + i32x8, + u32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + i64x4: i64, + i64x4_from_bits | u64x4, + f64x4, + i32x8, + u32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + u64x4: u64, + u64x4_from_bits | i64x4, + f64x4, + i32x8, + u32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits!( + f64x4: f64, + f64x4_from_bits | i64x4, + u64x4, + i32x8, + u32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m256; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m256i; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m256d; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f64x4: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u64x4: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i64x4: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f32x8: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u32x8: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i32x8: 
__m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u16x16: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i16x16: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u8x32: __m256, __m256i, __m256d); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i8x32: __m256, __m256i, __m256d); + +impl_from!( + f64x4: f64, + f64x4_from | u64x4, + i64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + i64x4: i64, + i64x4_from | u64x4, + f64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + u64x4: u64, + u64x4_from | i64x4, + f64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + u8x4, + i8x4 +); +impl_from!( + f32x8: f32, + f32x8_from | u64x8, + i64x8, + f64x8, + u32x8, + i32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + i32x8: i32, + i32x8_from | u64x8, + i64x8, + f64x8, + u32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + u32x8: u32, + u32x8_from | u64x8, + i64x8, + f64x8, + i32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + i16x16: i16, + i16x16_from | u32x16, + i32x16, + f32x16, + u16x16, + u8x16, + i8x16 +); +impl_from!( + u16x16: u16, + u16x16_from | u32x16, + i32x16, + f32x16, + i16x16, + u8x16, + i8x16 +); +impl_from!(i8x32: i8, i8x32_from | u16x32, i16x32, u8x32); +impl_from!(u8x32: u8, u8x32_from | u16x32, i16x32, i8x32); diff --git a/coresimd/ppsv/v32.rs b/coresimd/ppsv/v32.rs new file mode 100644 index 0000000000..be772bd6a8 --- /dev/null +++ b/coresimd/ppsv/v32.rs @@ -0,0 +1,96 @@ +//! 32-bit wide portable packed vector types. + +simd_api_imports!(); +use ::coresimd::simd::{b8x2}; + +simd_i_ty! { + i16x2: 2, i16, b8x2, i16x2_tests | + i16, i16 | + x0, x1 | + /// A 32-bit wide vector with 2 `i16` lanes. +} + +simd_u_ty! { + u16x2: 2, u16, b8x2, u16x2_tests | + u16, u16 | + x0, x1 | + /// A 32-bit wide vector with 2 `u16` lanes. +} + +simd_i_ty! { + i8x4: 4, i8, b8x4, i8x4_tests | + i8, i8, i8, i8 | + x0, x1, x2, x3 | + /// A 32-bit wide vector with 4 `i8` lanes. +} + +simd_u_ty! { + u8x4: 4, u8, b8x4, u8x4_tests | + u8, u8, u8, u8 | + x0, x1, x2, x3 | + /// A 32-bit wide vector with 4 `u8` lanes. +} + +simd_b_ty! { + b8x4: 4, i8, b8x4_tests | + i8, i8, i8, i8 | + x0, x1, x2, x3 | + /// A 32-bit wide vector with 4 `bool` lanes. +} + +impl_from_bits!(i16x2: i16, i16x2_from_bits | u16x2, i8x4, u8x4, b8x4); +impl_from_bits!(u16x2: u16, u16x2_from_bits | i16x2, i8x4, u8x4, b8x4); +impl_from_bits!(i8x4: i8, i8x2_from_bits | i16x2, u16x2, u8x4, b8x4); +impl_from_bits!(u8x4: u8, u8x2_from_bits | i16x2, u16x2, i8x4, b8x4); + +impl_from!( + i16x2: i16, + i16x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + i32x2, + u16x2, + u8x2, + i8x2 +); + +impl_from!( + u16x2: u16, + u16x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + i32x2, + i16x2, + u8x2, + i8x2 +); + +impl_from!( + i8x4: i8, + i8x4_from | f64x4, + u64x4, + i64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + u8x4 +); + +impl_from!( + u8x4: u8, + u8x4_from | f64x4, + u64x4, + i64x4, + u32x4, + i32x4, + f32x4, + u16x4, + i16x4, + i8x4 +); diff --git a/coresimd/ppsv/v512.rs b/coresimd/ppsv/v512.rs new file mode 100644 index 0000000000..c9e1c0e7d2 --- /dev/null +++ b/coresimd/ppsv/v512.rs @@ -0,0 +1,331 @@ +//! 512-bit wide portable packed vector types. + +simd_api_imports!(); + +use ::coresimd::simd::{b8x32, b8x16, b8x8}; + +simd_i_ty! 
{ + i8x64: 64, i8, b8x64, i8x64_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31, + x32, x33, x34, x35, x36, x37, x38, x39, + x40, x41, x42, x43, x44, x45, x46, x47, + x48, x49, x50, x51, x52, x53, x54, x55, + x56, x57, x58, x59, x60, x61, x62, x63 | + /// A 512-bit vector with 64 `i8` lanes. +} + +simd_u_ty! { + u8x64: 64, u8, b8x64, u8x64_tests | + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31, + x32, x33, x34, x35, x36, x37, x38, x39, + x40, x41, x42, x43, x44, x45, x46, x47, + x48, x49, x50, x51, x52, x53, x54, x55, + x56, x57, x58, x59, x60, x61, x62, x63 | + /// A 512-bit vector with 64 `u8` lanes. +} + +simd_b_ty! { + b8x64: 64, i8, b8x64_tests | + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31, + x32, x33, x34, x35, x36, x37, x38, x39, + x40, x41, x42, x43, x44, x45, x46, x47, + x48, x49, x50, x51, x52, x53, x54, x55, + x56, x57, x58, x59, x60, x61, x62, x63 | + /// A 512-bit vector with 64 `bool` lanes. +} + +simd_i_ty! { + i16x32: 32, i16, b8x32, i16x32_tests | + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 512-bit vector with 32 `i16` lanes. +} + +simd_u_ty! { + u16x32: 32, u16, b8x32, u16x32_tests | + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 | + /// A 512-bit vector with 32 `u16` lanes. +} +simd_i_ty! { + i32x16: 16, i32, b8x16, i32x16_tests | + i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 512-bit vector with 16 `i32` lanes. +} + +simd_u_ty! { + u32x16: 16, u32, b8x16, u32x16_tests | + u32, u32, u32, u32, u32, u32, u32, u32, + u32, u32, u32, u32, u32, u32, u32, u32 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 512-bit vector with 16 `u32` lanes. +} + +simd_f_ty! 
{ + f32x16: 16, f32, b8x16, f32x16_tests | + f32, f32, f32, f32, f32, f32, f32, f32, + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 | + /// A 512-bit vector with 16 `f32` lanes. +} + +simd_i_ty! { + i64x8: 8, i64, b8x8, i64x8_tests | + i64, i64, i64, i64, i64, i64, i64, i64 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 512-bit vector with 8 `i64` lanes. +} + +simd_u_ty! { + u64x8: 8, u64, b8x8, u64x8_tests | + u64, u64, u64, u64, u64, u64, u64, u64 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 512-bit vector with 8 `u64` lanes. +} + +simd_f_ty! { + f64x8: 8, f64, b8x8, f64x8_tests | + f64, f64, f64, f64, f64, f64, f64, f64 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 512-bit vector with 8 `f64` lanes. +} + +impl_from_bits!( + i8x64: i8, + i8x64_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + u8x64, + b8x64 +); +impl_from_bits!( + u8x64: u8, + u8x64_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + b8x64 +); +impl_from_bits!( + i16x32: i16, + i16x32_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + u16x32: u16, + u16x32_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + i32x16: i32, + i32x16_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + u32x16: u32, + u32x16_from_bits | u64x8, + i64x8, + f64x8, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + f32x16: f32, + f32x16_from_bits | u64x8, + i64x8, + f64x8, + u32x16, + i32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + i64x8: i64, + i64x8_from_bits | u64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + u64x8: u64, + u64x8_from_bits | i64x8, + f64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); +impl_from_bits!( + f64x8: f64, + f64x8_from_bits | u64x8, + i64x8, + u32x16, + i32x16, + f32x16, + u16x32, + i16x32, + i8x64, + u8x64, + b8x64 +); + +impl_from!( + f64x8: f64, + f64x8_from | u64x8, + i64x8, + u32x8, + i32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + i64x8: i64, + i64x8_from | u64x8, + f64x8, + u32x8, + i32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); +impl_from!( + u64x8: u64, + u64x8_from | i64x8, + f64x8, + u32x8, + i32x8, + f32x8, + u16x8, + i16x8, + u8x8, + i8x8 +); + +impl_from!( + f32x16: f32, + f32x16_from | u32x16, + i32x16, + u16x16, + i16x16, + u8x16, + i8x16 +); +impl_from!( + i32x16: i32, + i32x16_from | u32x16, + f32x16, + u16x16, + i16x16, + u8x16, + i8x16 +); +impl_from!( + u32x16: u32, + u32x16_from | i32x16, + f32x16, + u16x16, + i16x16, + u8x16, + i8x16 +); + +impl_from!(i16x32: i16, i16x32_from | u16x32, u8x32, i8x32); +impl_from!(u16x32: u16, u16x32_from | i16x32, u8x32, i8x32); + +impl_from!(i8x64: i8, i8x64_from | u8x64); +impl_from!(u8x64: u8, u8x64_from | i8x64); diff --git a/coresimd/ppsv/v64.rs b/coresimd/ppsv/v64.rs new file mode 100644 index 0000000000..874a12f45a --- /dev/null +++ b/coresimd/ppsv/v64.rs @@ -0,0 +1,235 @@ +//! 64-bit wide portable packed vector types. + +simd_api_imports!(); + +use ::coresimd::simd::{b8x4, b8x2}; + +simd_i_ty! 
{ + i8x8: 8, i8, b8x8, i8x8_tests | + i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 64-bit vector with 8 `i8` lanes. +} + +simd_u_ty! { + u8x8: 8, u8, b8x8, u8x8_tests | + u8, u8, u8, u8, u8, u8, u8, u8 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 64-bit vector with 8 `u8` lanes. +} + +simd_b_ty! { + b8x8: 8, i8, b8x8_tests | + i8, i8, i8, i8, i8, i8, i8, i8 | + x0, x1, x2, x3, x4, x5, x6, x7 | + /// A 64-bit vector with 8 `bool` lanes. +} + +simd_i_ty! { + i16x4: 4, i16, b8x4, i16x4_tests | + i16, i16, i16, i16 | + x0, x1, x2, x3 | + /// A 64-bit vector with 4 `i16` lanes. +} + +simd_u_ty! { + u16x4: 4, u16, b8x4, u16x4_tests | + u16, u16, u16, u16 | + x0, x1, x2, x3 | + /// A 64-bit vector with 4 `u16` lanes. +} + +simd_i_ty! { + i32x2: 2, i32, b8x2, i32x2_tests | + i32, i32 | + x0, x1 | + /// A 64-bit vector with 2 `i32` lanes. +} + +simd_u_ty! { + u32x2: 2, u32, b8x2, u32x2_tests | + u32, u32 | + x0, x1 | + /// A 64-bit vector with 2 `u32` lanes. +} + +simd_f_ty! { + f32x2: 2, f32, b8x2, f32x2_tests | + f32, f32 | + x0, x1 | + /// A 64-bit vector with 2 `f32` lanes. +} + +impl_from_bits!( + u32x2: u32, + u32x2_from_bits | i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + i32x2: i32, + i32x2_from_bits | u32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + f32x2: f32, + f32x2_from_bits | i32x2, + u32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + u16x4: u16, + u16x4_from_bits | u32x2, + i32x2, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + i16x4: i16, + i16x4_from_bits | u32x2, + i32x2, + u16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits!( + u8x8: u8, + u8x8_from_bits | u32x2, + i32x2, + u16x4, + i16x4, + i8x8, + b8x8 +); +impl_from_bits!( + i8x8: i8, + i8x8_from_bits | u32x2, + i32x2, + u16x4, + i16x4, + u8x8, + b8x8 +); + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use coresimd::x86::__m64; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(f32x2: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u32x2: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i32x2: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u16x4: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i16x4: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(u8x8: __m64); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl_from_bits_!(i8x8: __m64); + +impl_from!( + f32x2: f32, + f32x2_from | f64x2, + u64x2, + i64x2, + u32x2, + i32x2, + u16x2, + i16x2, + u8x2, + i8x2 +); + +impl_from!( + u32x2: u32, + u32x2_from | f64x2, + u64x2, + i64x2, + f32x2, + i32x2, + u16x2, + i16x2, + u8x2, + i8x2 +); + +impl_from!( + i32x2: i32, + i32x2_from | f64x2, + u64x2, + i64x2, + f32x2, + u32x2, + u16x2, + i16x2, + u8x2, + i8x2 +); + +impl_from!( + u16x4: u16, + u16x4_from | f64x4, + u64x4, + i64x4, + f32x4, + i32x4, + u32x4, + i16x4, + u8x4, + i8x4 +); + +impl_from!( + i16x4: i16, + i16x4_from | f64x4, + u64x4, + i64x4, + f32x4, + i32x4, + u32x4, + u16x4, + u8x4, + i8x4 +); +impl_from!( + i8x8: i8, + i8x8_from | f64x8, + u64x8, + i64x8, + f32x8, + u32x8, + i32x8, + i16x8, + u16x8, + u8x8 +); +impl_from!( + u8x8: u8, + u8x8_from | f64x8, + u64x8, + i64x8, + f32x8, + u32x8, + i32x8, + i16x8, + u16x8, + i8x8 +); diff --git a/coresimd/v128.rs b/coresimd/v128.rs deleted file mode 100644 index 
ccdfc7f662..0000000000 --- a/coresimd/v128.rs +++ /dev/null @@ -1,111 +0,0 @@ -//! 128-bit wide vector types - -use prelude::v1::*; - -use coresimd::simd_llvm::*; - -define_ty! { f64x2, f64, f64 } -define_impl! { f64x2, f64, 2, i64x2, x0, x1 } - -define_ty! { f32x4, f32, f32, f32, f32 } -define_impl! { f32x4, f32, 4, i32x4, x0, x1, x2, x3 } - -define_ty! { u64x2, u64, u64 } -define_impl! { u64x2, u64, 2, i64x2, x0, x1 } - -define_ty! { i64x2, i64, i64 } -define_impl! { i64x2, i64, 2, i64x2, x0, x1 } - -define_ty! { u32x4, u32, u32, u32, u32 } -define_impl! { u32x4, u32, 4, i32x4, x0, x1, x2, x3 } - -define_ty! { i32x4, i32, i32, i32, i32 } -define_impl! { i32x4, i32, 4, i32x4, x0, x1, x2, x3 } - -define_ty! { u16x8, u16, u16, u16, u16, u16, u16, u16, u16 } -define_impl! { u16x8, u16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { i16x8, i16, i16, i16, i16, i16, i16, i16, i16 } -define_impl! { i16x8, i16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { - u8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 -} -define_impl! { - u8x16, u8, 16, i8x16, - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - i8x16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 -} -define_impl! { - i8x16, i8, 16, i8x16, - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_from!(u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); -define_from!(i64x2, u64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); -define_from!(u32x4, u64x2, i64x2, i32x4, u16x8, i16x8, u8x16, i8x16); -define_from!(i32x4, u64x2, i64x2, u32x4, u16x8, i16x8, u8x16, i8x16); -define_from!(u16x8, u64x2, i64x2, u32x4, i32x4, i16x8, u8x16, i8x16); -define_from!(i16x8, u64x2, i64x2, u32x4, i32x4, u16x8, u8x16, i8x16); -define_from!(u8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, i8x16); -define_from!(i8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16); - -define_common_ops!( - f64x2, - f32x4, - u64x2, - i64x2, - u32x4, - i32x4, - u16x8, - i16x8, - u8x16, - i8x16 -); -define_float_ops!(f64x2, f32x4); -define_integer_ops!( - (u64x2, u64), - (i64x2, i64), - (u32x4, u32), - (i32x4, i32), - (u16x8, u16), - (i16x8, i16), - (u8x16, u8), - (i8x16, i8) -); -define_signed_integer_ops!(i64x2, i32x4, i16x8, i8x16); -define_casts!( - (f64x2, f32x2, as_f32x2), - (f64x2, u64x2, as_u64x2), - (f64x2, i64x2, as_i64x2), - (f32x4, f64x4, as_f64x4), - (f32x4, u32x4, as_u32x4), - (f32x4, i32x4, as_i32x4), - (u64x2, f64x2, as_f64x2), - (u64x2, i64x2, as_i64x2), - (i64x2, f64x2, as_f64x2), - (i64x2, u64x2, as_u64x2), - (u32x4, f32x4, as_f32x4), - (u32x4, i32x4, as_i32x4), - (i32x4, f32x4, as_f32x4), - (i32x4, u32x4, as_u32x4), - (u16x8, i16x8, as_i16x8), - (i16x8, u16x8, as_u16x8), - (u8x16, i8x16, as_i8x16), - (i8x16, u8x16, as_u8x16) -); - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn operators() { - test_ops_si!(i8x16, i16x8, i32x4, i64x2); - test_ops_ui!(u8x16, u16x8, u32x4, u64x2); - test_ops_f!(f32x4, f64x2); - } -} diff --git a/coresimd/v256.rs b/coresimd/v256.rs deleted file mode 100644 index 384ac4ed04..0000000000 --- a/coresimd/v256.rs +++ /dev/null @@ -1,134 +0,0 @@ -//! 256-bit wide vector types - -use prelude::v1::*; - -use coresimd::simd_llvm::*; - -define_ty! { f64x4, f64, f64, f64, f64 } -define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 } - -define_ty! { f32x8, f32, f32, f32, f32, f32, f32, f32, f32 } -define_impl! { f32x8, f32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! 
{ u64x4, u64, u64, u64, u64 } -define_impl! { u64x4, u64, 4, i64x4, x0, x1, x2, x3 } - -define_ty! { i64x4, i64, i64, i64, i64 } -define_impl! { i64x4, i64, 4, i64x4, x0, x1, x2, x3 } - -define_ty! { u32x8, u32, u32, u32, u32, u32, u32, u32, u32 } -define_impl! { u32x8, u32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { i32x8, i32, i32, i32, i32, i32, i32, i32, i32 } -define_impl! { i32x8, i32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { - u16x16, - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16 -} -define_impl! { - u16x16, u16, 16, i16x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - i16x16, - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16 -} -define_impl! { - i16x16, i16, 16, i16x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - u8x32, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 -} -define_impl! { - u8x32, u8, 32, i8x32, - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -} - -define_ty! { - i8x32, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 -} -define_impl! { - i8x32, i8, 32, i8x32, - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -} - -define_from!(u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); -define_from!(i64x4, u64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); -define_from!(u32x8, u64x4, i64x4, i32x8, u16x16, i16x16, u8x32, i8x32); -define_from!(i32x8, u64x4, i64x4, u32x8, u16x16, i16x16, u8x32, i8x32); -define_from!(u16x16, u64x4, i64x4, u32x8, i32x8, i16x16, u8x32, i8x32); -define_from!(i16x16, u64x4, i64x4, u32x8, i32x8, u16x16, u8x32, i8x32); -define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32); -define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32); - -define_common_ops!( - f64x4, - f32x8, - u64x4, - i64x4, - u32x8, - i32x8, - u16x16, - i16x16, - u8x32, - i8x32 -); -define_float_ops!(f64x4, f32x8); -define_integer_ops!( - (u64x4, u64), - (i64x4, i64), - (u32x8, u32), - (i32x8, i32), - (u16x16, u16), - (i16x16, i16), - (u8x32, u8), - (i8x32, i8) -); -define_signed_integer_ops!(i64x4, i32x8, i16x16, i8x32); -define_casts!( - (f64x4, f32x4, as_f32x4), - (f64x4, u64x4, as_u64x4), - (f64x4, i64x4, as_i64x4), - (f32x8, u32x8, as_u32x8), - (f32x8, i32x8, as_i32x8), - (u64x4, f64x4, as_f64x4), - (u64x4, i64x4, as_i64x4), - (i64x4, f64x4, as_f64x4), - (i64x4, u64x4, as_u64x4), - (u32x8, f32x8, as_f32x8), - (u32x8, i32x8, as_i32x8), - (i32x8, f32x8, as_f32x8), - (i32x8, u32x8, as_u32x8), - (u16x16, i16x16, as_i16x16), - (i16x16, u16x16, as_u16x16), - (u8x32, i8x32, as_i8x32), - (i8x32, u8x32, as_u8x32) -); - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn operators() { - test_ops_si!(i8x32, i16x16, i32x8, i64x4); - test_ops_ui!(u8x32, u16x16, u32x8, u64x4); - test_ops_f!(f32x8, f64x4); - } -} diff --git a/coresimd/v512.rs b/coresimd/v512.rs deleted file mode 100644 index 351bef9f02..0000000000 --- a/coresimd/v512.rs +++ /dev/null @@ -1,180 +0,0 @@ -//! 512-bit wide vector types - -use prelude::v1::*; - -use coresimd::simd_llvm::*; - -define_ty! 
{ f64x8, f64, f64, f64, f64, f64, f64, f64, f64 } -define_impl! { f64x8, f64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { - f32x16, - f32, f32, f32, f32, f32, f32, f32, f32, - f32, f32, f32, f32, f32, f32, f32, f32 -} -define_impl! { - f32x16, f32, 16, i32x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { u64x8, u64, u64, u64, u64, u64, u64, u64, u64 } -define_impl! { u64x8, u64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { i64x8, i64, i64, i64, i64, i64, i64, i64, i64 } -define_impl! { i64x8, i64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 } - -define_ty! { - u32x16, - u32, u32, u32, u32, u32, u32, u32, u32, - u32, u32, u32, u32, u32, u32, u32, u32 -} -define_impl! { - u32x16, u32, 16, i32x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - i32x16, - i32, i32, i32, i32, i32, i32, i32, i32, - i32, i32, i32, i32, i32, i32, i32, i32 -} -define_impl! { - i32x16, i32, 16, i32x16, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15 -} - -define_ty! { - u16x32, - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16 -} -define_impl! { - u16x32, u16, 32, i16x32, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -} - -define_ty! { - i16x32, - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16 -} -define_impl! { - i16x32, i16, 32, i16x32, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -} - -define_ty! { - u8x64, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 -} -define_impl! { - u8x64, u8, 64, i8x64, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31, - x32, x33, x34, x35, x36, x37, x38, x39, - x40, x41, x42, x43, x44, x45, x46, x47, - x48, x49, x50, x51, x52, x53, x54, x55, - x56, x57, x58, x59, x60, x61, x62, x63 -} - -define_ty! { - i8x64, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 -} -define_impl! 
{ - i8x64, i8, 64, i8x64, - x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31, - x32, x33, x34, x35, x36, x37, x38, x39, - x40, x41, x42, x43, x44, x45, x46, x47, - x48, x49, x50, x51, x52, x53, x54, x55, - x56, x57, x58, x59, x60, x61, x62, x63 -} - -define_from!(u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64); -define_from!(i64x8, u64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64); -define_from!(u32x16, u64x8, i64x8, i32x16, u16x32, i16x32, u8x64, i8x64); -define_from!(i32x16, u64x8, i64x8, u32x16, u16x32, i16x32, u8x64, i8x64); -define_from!(u16x32, u64x8, i64x8, u32x16, i32x16, i16x32, u8x64, i8x64); -define_from!(i16x32, u64x8, i64x8, u32x16, i32x16, u16x32, u8x64, i8x64); -define_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64); -define_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64); - -define_common_ops!( - f64x8, - f32x16, - u64x8, - i64x8, - u32x16, - i32x16, - u16x32, - i16x32, - u8x64, - i8x64 -); -define_float_ops!(f64x8, f32x16); -define_integer_ops!( - (u64x8, u64), - (i64x8, i64), - (u32x16, u32), - (i32x16, i32), - (u16x32, u16), - (i16x32, i16), - (u8x64, u8), - (i8x64, i8) -); -define_signed_integer_ops!(i64x8, i32x16, i16x32, i8x64); -define_casts!( - (f64x8, f32x8, as_f32x8), - (f64x8, u64x8, as_u64x8), - (f64x8, i64x8, as_i64x8), - (f32x16, u32x16, as_u32x16), - (f32x16, i32x16, as_i32x16), - (u64x8, f64x8, as_f64x8), - (u64x8, i64x8, as_i64x8), - (i64x8, f64x8, as_f64x8), - (i64x8, u64x8, as_u64x8), - (u32x16, f32x16, as_f32x16), - (u32x16, i32x16, as_i32x16), - (i32x16, f32x16, as_f32x16), - (i32x16, u32x16, as_u32x16), - (u16x32, i16x32, as_i16x32), - (i16x32, u16x32, as_u16x32), - (u8x64, i8x64, as_i8x64), - (i8x64, u8x64, as_u8x64) -); - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn operators() { - test_ops_si!(i8x64, i16x32, i32x16, i64x8); - test_ops_ui!(u8x64, u16x32, u32x16, u64x8); - test_ops_f!(f32x16, f64x8); - } -} diff --git a/coresimd/x86/aes.rs b/coresimd/x86/aes.rs index 6cb618343c..9ced4a2022 100644 --- a/coresimd/x86/aes.rs +++ b/coresimd/x86/aes.rs @@ -60,7 +60,7 @@ pub unsafe fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i { aesenclast(a, round_key) } -/// Perform the “InvMixColumns” transformation on `a`. +/// Perform the `InvMixColumns` transformation on `a`. #[inline] #[target_feature(enable = "aes")] #[cfg_attr(test, assert_instr(aesimc))] diff --git a/coresimd/x86/avx.rs b/coresimd/x86/avx.rs index 81cc7dff98..0686369481 100644 --- a/coresimd/x86/avx.rs +++ b/coresimd/x86/avx.rs @@ -14,8 +14,7 @@ //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions use coresimd::simd_llvm::*; -use coresimd::v128::*; -use coresimd::v256::*; +use coresimd::simd::*; use coresimd::x86::*; use intrinsics; use mem; diff --git a/coresimd/x86/avx2.rs b/coresimd/x86/avx2.rs index 7cba6aa7cf..763ccf8614 100644 --- a/coresimd/x86/avx2.rs +++ b/coresimd/x86/avx2.rs @@ -19,10 +19,7 @@ //! 
[wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate use coresimd::simd_llvm::*; -use coresimd::v256::*; -use coresimd::v128::*; -use coresimd::v64::*; -use coresimd::v32::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; @@ -142,102 +139,144 @@ pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { let b = b.as_i8x32(); let r: i8x32 = match n { - 0 => { - simd_shuffle32(b, a, [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ]) - } - 1 => { - simd_shuffle32(b, a, [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, - ]) - } - 2 => { - simd_shuffle32(b, a, [ - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, - 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, - ]) - } - 3 => { - simd_shuffle32(b, a, [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, - ]) - } - 4 => { - simd_shuffle32(b, a, [ - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, - ]) - } - 5 => { - simd_shuffle32(b, a, [ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, - ]) - } - 6 => { - simd_shuffle32(b, a, [ + 0 => simd_shuffle32( + b, + a, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle32( + b, + a, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, + ], + ), + 2 => simd_shuffle32( + b, + a, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, + ], + ), + 3 => simd_shuffle32( + b, + a, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, + ], + ), + 4 => simd_shuffle32( + b, + a, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, + ], + ), + 5 => simd_shuffle32( + b, + a, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, + ], + ), + 6 => simd_shuffle32( + b, + a, + [ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, - ]) - } - 7 => { - simd_shuffle32(b, a, [ + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, + 53, + ], + ), + 7 => simd_shuffle32( + b, + a, + [ 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, - 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, - ]) - } - 8 => { - simd_shuffle32(b, a, [ + 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, + 54, + ], + ), + 8 => simd_shuffle32( + b, + a, + [ 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, - 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, - ]) - } - 9 => { - simd_shuffle32(b, a, [ + 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, + 55, + ], + ), + 9 => simd_shuffle32( + b, + a, + [ 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, - 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, - ]) - } - 10 => { - simd_shuffle32(b, a, [ - 10, 11, 12, 13, 14, 15, 
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - ]) - } - 11 => { - simd_shuffle32(b, a, [ - 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, - 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - ]) - } - 12 => { - simd_shuffle32(b, a, [ - 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, - 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - ]) - } - 13 => { - simd_shuffle32(b, a, [ - 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - ]) - } - 14 => { - simd_shuffle32(b, a, [ - 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, - ]) - } - 15 => { - simd_shuffle32(b, a, [ - 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, - 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, - ]) - } + 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, + 56, + ], + ), + 10 => simd_shuffle32( + b, + a, + [ + 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, + ], + ), + 11 => simd_shuffle32( + b, + a, + [ + 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, + 57, 58, + ], + ), + 12 => simd_shuffle32( + b, + a, + [ + 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + 58, 59, + ], + ), + 13 => simd_shuffle32( + b, + a, + [ + 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, + ], + ), + 14 => simd_shuffle32( + b, + a, + [ + 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, + ], + ), + 15 => simd_shuffle32( + b, + a, + [ + 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, + ], + ), _ => b, }; mem::transmute(r) diff --git a/coresimd/x86/mmx.rs b/coresimd/x86/mmx.rs index 411ce12063..a8ba18f998 100644 --- a/coresimd/x86/mmx.rs +++ b/coresimd/x86/mmx.rs @@ -8,7 +8,7 @@ //! //! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -use coresimd::v64::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; diff --git a/coresimd/x86/mod.rs b/coresimd/x86/mod.rs index 1770a719fe..9195e58be0 100644 --- a/coresimd/x86/mod.rs +++ b/coresimd/x86/mod.rs @@ -359,42 +359,42 @@ pub(crate) trait m128iExt: Sized { fn as_m128i(self) -> __m128i; #[inline] - fn as_u8x16(self) -> ::coresimd::v128::u8x16 { + fn as_u8x16(self) -> ::coresimd::simd::u8x16 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_u16x8(self) -> ::coresimd::v128::u16x8 { + fn as_u16x8(self) -> ::coresimd::simd::u16x8 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_u32x4(self) -> ::coresimd::v128::u32x4 { + fn as_u32x4(self) -> ::coresimd::simd::u32x4 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_u64x2(self) -> ::coresimd::v128::u64x2 { + fn as_u64x2(self) -> ::coresimd::simd::u64x2 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_i8x16(self) -> ::coresimd::v128::i8x16 { + fn as_i8x16(self) -> ::coresimd::simd::i8x16 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_i16x8(self) -> ::coresimd::v128::i16x8 { + fn as_i16x8(self) -> ::coresimd::simd::i16x8 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_i32x4(self) -> ::coresimd::v128::i32x4 { + fn as_i32x4(self) -> ::coresimd::simd::i32x4 { unsafe { mem::transmute(self.as_m128i()) } } #[inline] - fn as_i64x2(self) -> ::coresimd::v128::i64x2 { + fn as_i64x2(self) -> ::coresimd::simd::i64x2 { unsafe { mem::transmute(self.as_m128i()) } } } @@ -412,42 +412,42 @@ pub(crate) trait m256iExt: Sized { fn as_m256i(self) -> __m256i; #[inline] - fn as_u8x32(self) -> ::coresimd::v256::u8x32 { + fn as_u8x32(self) -> ::coresimd::simd::u8x32 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_u16x16(self) -> ::coresimd::v256::u16x16 { + fn as_u16x16(self) -> ::coresimd::simd::u16x16 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_u32x8(self) -> ::coresimd::v256::u32x8 { + fn as_u32x8(self) -> ::coresimd::simd::u32x8 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_u64x4(self) -> ::coresimd::v256::u64x4 { + fn as_u64x4(self) -> ::coresimd::simd::u64x4 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_i8x32(self) -> ::coresimd::v256::i8x32 { + fn as_i8x32(self) -> ::coresimd::simd::i8x32 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_i16x16(self) -> ::coresimd::v256::i16x16 { + fn as_i16x16(self) -> ::coresimd::simd::i16x16 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_i32x8(self) -> ::coresimd::v256::i32x8 { + fn as_i32x8(self) -> ::coresimd::simd::i32x8 { unsafe { mem::transmute(self.as_m256i()) } } #[inline] - fn as_i64x4(self) -> ::coresimd::v256::i64x4 { + fn as_i64x4(self) -> ::coresimd::simd::i64x4 { unsafe { mem::transmute(self.as_m256i()) } } } @@ -459,6 +459,99 @@ impl m256iExt for __m256i { } } +use coresimd::simd::{b8x32, b8x16, b8x8, + f32x4, f32x8, f64x2, f64x4, i16x16, + i16x4, i16x8, i32x2, i32x4, i32x8, i64x2, i64x4, i8x16, + i8x32, i8x8, u16x16, u16x4, u16x8, u32x2, u32x4, u32x8, + u64x2, u64x4, u8x16, u8x32, u8x8}; + +impl_from_bits_!( + __m64: u32x2, + i32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + __m128: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + 
__m128i: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + __m128d: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + __m256: u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits_!( + __m256i: u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); +impl_from_bits_!( + __m256d: u64x4, + i64x4, + f64x4, + u32x8, + i32x8, + f32x8, + u16x16, + i16x16, + u8x32, + i8x32, + b8x32 +); mod eflags; pub use self::eflags::*; diff --git a/coresimd/x86/pclmulqdq.rs b/coresimd/x86/pclmulqdq.rs index 8c4f4b8574..b36f69cbc0 100644 --- a/coresimd/x86/pclmulqdq.rs +++ b/coresimd/x86/pclmulqdq.rs @@ -23,20 +23,26 @@ extern "C" { /// should be used. Immediate bits other than 0 and 4 are ignored. #[inline] #[target_feature(enable = "pclmulqdq")] -#[cfg_attr(all(test, not(target_os="linux")), assert_instr(pclmulqdq, imm8 = 0))] -#[cfg_attr(all(test, target_os="linux"), assert_instr(pclmullqlqdq, imm8 = 0))] -#[cfg_attr(all(test, target_os="linux"), assert_instr(pclmulhqlqdq, imm8 = 1))] -#[cfg_attr(all(test, target_os="linux"), assert_instr(pclmullqhqdq, imm8 = 16))] -#[cfg_attr(all(test, target_os="linux"), assert_instr(pclmulhqhqdq, imm8 = 17))] +#[cfg_attr(all(test, not(target_os = "linux")), + assert_instr(pclmulqdq, imm8 = 0))] +#[cfg_attr(all(test, target_os = "linux"), + assert_instr(pclmullqlqdq, imm8 = 0))] +#[cfg_attr(all(test, target_os = "linux"), + assert_instr(pclmulhqlqdq, imm8 = 1))] +#[cfg_attr(all(test, target_os = "linux"), + assert_instr(pclmullqhqdq, imm8 = 16))] +#[cfg_attr(all(test, target_os = "linux"), + assert_instr(pclmulhqhqdq, imm8 = 17))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_clmulepi64_si128(a: __m128i, b: __m128i, imm8: i32) -> __m128i { +pub unsafe fn _mm_clmulepi64_si128( + a: __m128i, b: __m128i, imm8: i32 +) -> __m128i { macro_rules! call { ($imm8:expr) => (pclmulqdq(a, b, $imm8)) } constify_imm8!(imm8, call) } - #[cfg(test)] mod tests { // The constants in the tests below are just bit patterns. They should not diff --git a/coresimd/x86/rdrand.rs b/coresimd/x86/rdrand.rs index e15da28e0e..9877125851 100644 --- a/coresimd/x86/rdrand.rs +++ b/coresimd/x86/rdrand.rs @@ -1,6 +1,6 @@ //! RDRAND and RDSEED instructions for returning random numbers from an Intel -//! on-chip hardware random number generator which has been seeded by an on-chip -//! entropy source. +//! on-chip hardware random number generator which has been seeded by an +//! on-chip entropy source. extern "platform-intrinsic" { fn x86_rdrand16_step() -> (u16, i32); @@ -17,6 +17,7 @@ use stdsimd_test::assert_instr; #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { let (v, flag) = x86_rdrand16_step(); *val = v; @@ -28,6 +29,7 @@ pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 { let (v, flag) = x86_rdrand32_step(); *val = v; diff --git a/coresimd/x86/sse.rs b/coresimd/x86/sse.rs index 1d871ab76f..6b5157149f 100644 --- a/coresimd/x86/sse.rs +++ b/coresimd/x86/sse.rs @@ -1,8 +1,7 @@ //! 
Streaming SIMD Extensions (SSE) use coresimd::simd_llvm::*; -use coresimd::v128::*; -use coresimd::v64::*; +use coresimd::simd::*; use coresimd::x86::*; use intrinsics; use mem; @@ -2167,12 +2166,10 @@ pub unsafe fn _mm_cvtps_pi8(a: __m128) -> __m64 { mod tests { use std::mem::transmute; use std::f32::NAN; - use stdsimd_test::simd_test; use test::black_box; // Used to inhibit constant-folding. - use coresimd::v128::*; - use coresimd::v64::*; + use coresimd::simd::*; use coresimd::x86::*; #[simd_test = "sse"] diff --git a/coresimd/x86/sse2.rs b/coresimd/x86/sse2.rs index dcfb149ee0..b16b920b59 100644 --- a/coresimd/x86/sse2.rs +++ b/coresimd/x86/sse2.rs @@ -4,8 +4,7 @@ use stdsimd_test::assert_instr; use coresimd::simd_llvm::*; -use coresimd::v128::*; -use coresimd::v64::*; +use coresimd::simd::*; use coresimd::x86::*; use intrinsics; use mem; @@ -319,6 +318,8 @@ pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_slli_si128_impl(a, imm8) } +/// Implementation detail: converts the immediate argument of the +/// `_mm_slli_si128` intrinsic into a compile-time constant. #[inline] #[target_feature(enable = "sse2")] unsafe fn _mm_slli_si128_impl(a: __m128i, imm8: i32) -> __m128i { @@ -479,6 +480,8 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_srli_si128_impl(a, imm8) } +/// Implementation detail: converts the immediate argument of the +/// `_mm_srli_si128` intrinsic into a compile-time constant. #[inline] #[target_feature(enable = "sse2")] unsafe fn _mm_srli_si128_impl(a: __m128i, imm8: i32) -> __m128i { @@ -2502,7 +2505,7 @@ mod tests { use stdsimd_test::simd_test; use test::black_box; // Used to inhibit constant-folding. use coresimd::x86::*; - use coresimd::v128::*; + use coresimd::simd::*; #[simd_test = "sse2"] unsafe fn test_mm_pause() { diff --git a/coresimd/x86/sse3.rs b/coresimd/x86/sse3.rs index b380cbe310..6272502996 100644 --- a/coresimd/x86/sse3.rs +++ b/coresimd/x86/sse3.rs @@ -1,7 +1,7 @@ //! Streaming SIMD Extensions 3 (SSE3) use coresimd::simd_llvm::{simd_shuffle2, simd_shuffle4}; -use coresimd::v128::*; +use coresimd::simd::*; use coresimd::x86::*; #[cfg(test)] diff --git a/coresimd/x86/sse41.rs b/coresimd/x86/sse41.rs index 84270c5278..e07ffd47e4 100644 --- a/coresimd/x86/sse41.rs +++ b/coresimd/x86/sse41.rs @@ -1,10 +1,7 @@ //! Streaming SIMD Extensions 4.1 (SSE4.1) use coresimd::simd_llvm::*; -use coresimd::v128::*; -use coresimd::v64::*; -use coresimd::v32::*; -use coresimd::v16::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; diff --git a/coresimd/x86/sse42.rs b/coresimd/x86/sse42.rs index 75456858fe..7dd7abb1c4 100644 --- a/coresimd/x86/sse42.rs +++ b/coresimd/x86/sse42.rs @@ -6,7 +6,7 @@ use stdsimd_test::assert_instr; use coresimd::simd_llvm::*; -use coresimd::v128::*; +use coresimd::simd::*; use coresimd::x86::*; /// String contains unsigned 8-bit characters *(Default)* diff --git a/coresimd/x86/sse4a.rs b/coresimd/x86/sse4a.rs index 5db910004f..0aae55502a 100644 --- a/coresimd/x86/sse4a.rs +++ b/coresimd/x86/sse4a.rs @@ -1,6 +1,6 @@ //! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) -use coresimd::v128::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; diff --git a/coresimd/x86/ssse3.rs b/coresimd/x86/ssse3.rs index 7b97443fb5..c15ad76613 100644 --- a/coresimd/x86/ssse3.rs +++ b/coresimd/x86/ssse3.rs @@ -1,7 +1,7 @@ //! 
Supplemental Streaming SIMD Extensions 3 (SSSE3) use coresimd::simd_llvm::simd_shuffle16; -use coresimd::v128::*; +use coresimd::simd::*; use coresimd::x86::*; use mem; diff --git a/coresimd/x86/test.rs b/coresimd/x86/test.rs index e03f3c413c..1b5b6b1fb0 100644 --- a/coresimd/x86/test.rs +++ b/coresimd/x86/test.rs @@ -134,6 +134,6 @@ mod x86_polyfill { } #[cfg(target_arch = "x86_64")] mod x86_polyfill { - pub use coresimd::x86_64::{_mm_insert_epi64, _mm256_insert_epi64}; + pub use coresimd::x86_64::{_mm256_insert_epi64, _mm_insert_epi64}; } pub use self::x86_polyfill::*; diff --git a/coresimd/x86_64/rdrand.rs b/coresimd/x86_64/rdrand.rs index b3311c2b96..917e900fef 100644 --- a/coresimd/x86_64/rdrand.rs +++ b/coresimd/x86_64/rdrand.rs @@ -1,6 +1,6 @@ //! RDRAND and RDSEED instructions for returning random numbers from an Intel -//! on-chip hardware random number generator which has been seeded by an on-chip -//! entropy source. +//! on-chip hardware random number generator which has been seeded by an +//! on-chip entropy source. extern "platform-intrinsic" { fn x86_rdrand64_step() -> (u64, i32); @@ -15,6 +15,7 @@ use stdsimd_test::assert_instr; #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] +#[cfg_attr(feature = "cargo-clippy", allow(stutter))] pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 { let (v, flag) = x86_rdrand64_step(); *val = v; diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs index becfc7f824..2400ccc082 100644 --- a/crates/coresimd/src/lib.rs +++ b/crates/coresimd/src/lib.rs @@ -1,7 +1,7 @@ //! SIMD and vendor intrinsics support library. //! -//! This documentation is for the `coresimd` crate, but you probably want to use -//! the [`stdsimd` crate][stdsimd] which should have more complete +//! This documentation is for the `coresimd` crate, but you probably want to +//! use the [`stdsimd` crate][stdsimd] which should have more complete //! documentation. //! //! 
[stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/ @@ -13,8 +13,10 @@ simd_ffi, target_feature, cfg_target_feature, i128_type, asm, integer_atomics, stmt_expr_attributes, core_intrinsics, crate_in_paths, no_core, attr_literals, rustc_attrs, stdsimd, - staged_api)] -#![cfg_attr(test, feature(proc_macro, test, attr_literals, abi_vectorcall))] + staged_api, fn_must_use, core_float, core_slice_ext, align_offset)] +#![cfg_attr(test, + feature(proc_macro, test, attr_literals, abi_vectorcall, + untagged_unions))] #![cfg_attr(feature = "cargo-clippy", allow(inline_always, too_many_arguments, cast_sign_loss, cast_lossless, cast_possible_wrap, @@ -25,7 +27,8 @@ #![no_core] #![unstable(feature = "stdsimd", issue = "0")] #![doc(test(attr(deny(warnings))), - test(attr(allow(dead_code, deprecated, unused_variables, unused_mut))))] + test(attr(allow(dead_code, deprecated, unused_variables, + unused_mut))))] #[cfg_attr(not(test), macro_use)] extern crate core as _core; @@ -33,12 +36,12 @@ extern crate core as _core; #[macro_use] extern crate std; #[cfg(test)] +#[macro_use] +extern crate stdsimd; +#[cfg(test)] extern crate stdsimd_test; #[cfg(test)] extern crate test; -#[cfg(test)] -#[macro_use] -extern crate stdsimd; #[path = "../../../coresimd/mod.rs"] mod coresimd; @@ -53,8 +56,12 @@ use _core::cmp; #[allow(unused_imports)] use _core::convert; #[allow(unused_imports)] +use _core::default; +#[allow(unused_imports)] use _core::fmt; #[allow(unused_imports)] +use _core::hash; +#[allow(unused_imports)] use _core::intrinsics; #[allow(unused_imports)] use _core::iter; @@ -63,6 +70,8 @@ use _core::marker; #[allow(unused_imports)] use _core::mem; #[allow(unused_imports)] +use _core::num; +#[allow(unused_imports)] use _core::ops; #[allow(unused_imports)] use _core::option; @@ -72,3 +81,5 @@ use _core::prelude; use _core::ptr; #[allow(unused_imports)] use _core::result; +#[allow(unused_imports)] +use _core::slice; diff --git a/crates/coresimd/tests/cpu-detection.rs b/crates/coresimd/tests/cpu-detection.rs index 3cd7c580bf..eeb5a4b7b2 100644 --- a/crates/coresimd/tests/cpu-detection.rs +++ b/crates/coresimd/tests/cpu-detection.rs @@ -26,8 +26,14 @@ fn x86_all() { println!("avx512bw {:?}", is_target_feature_detected!("avx512bw")); println!("avx512dq {:?}", is_target_feature_detected!("avx512dq")); println!("avx512vl {:?}", is_target_feature_detected!("avx512vl")); - println!("avx512_ifma {:?}", is_target_feature_detected!("avx512ifma")); - println!("avx512_vbmi {:?}", is_target_feature_detected!("avx512vbmi")); + println!( + "avx512_ifma {:?}", + is_target_feature_detected!("avx512ifma") + ); + println!( + "avx512_vbmi {:?}", + is_target_feature_detected!("avx512vbmi") + ); println!( "avx512_vpopcntdq {:?}", is_target_feature_detected!("avx512vpopcntdq") diff --git a/crates/stdsimd-verify/src/lib.rs b/crates/stdsimd-verify/src/lib.rs index ddfa7adab9..1fad214406 100644 --- a/crates/stdsimd-verify/src/lib.rs +++ b/crates/stdsimd-verify/src/lib.rs @@ -241,7 +241,8 @@ fn find_target_feature(attrs: &[syn::Attribute]) -> Option { } fn find_required_const(attrs: &[syn::Attribute]) -> Vec { - attrs.iter() + attrs + .iter() .filter(|a| a.path.segments[0].ident == "rustc_args_required_const") .map(|a| a.tts.clone()) .map(|a| syn::parse::(a.into()).unwrap()) diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index a709108547..bc9ef5f948 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -312,7 +312,9 @@ fn 
matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { if rust.arguments.len() != intel.parameters.len() { bail!("wrong number of arguments on {}", rust.name) } - for (i, (a, b)) in intel.parameters.iter().zip(rust.arguments).enumerate() { + for (i, (a, b)) in + intel.parameters.iter().zip(rust.arguments).enumerate() + { let is_const = rust.required_const.contains(&i); equate(b, &a.type_, &intel.name, is_const)?; } @@ -353,15 +355,14 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { Ok(()) } -fn equate(t: &Type, - intel: &str, - intrinsic: &str, - is_const: bool) -> Result<(), String> { +fn equate( + t: &Type, intel: &str, intrinsic: &str, is_const: bool +) -> Result<(), String> { let intel = intel.replace(" *", "*"); let intel = intel.replace(" const*", "*"); let require_const = || { if is_const { - return Ok(()) + return Ok(()); } Err(format!("argument required to be const but isn't")) }; diff --git a/crates/stdsimd/src/lib.rs b/crates/stdsimd/src/lib.rs index 16748a3ba4..0c153b35b9 100644 --- a/crates/stdsimd/src/lib.rs +++ b/crates/stdsimd/src/lib.rs @@ -8,15 +8,16 @@ //! [stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/ #![feature(const_fn, integer_atomics, staged_api, stdsimd)] +#![cfg_attr(feature = "cargo-clippy", allow(shadow_reuse))] #![cfg_attr(target_os = "linux", feature(linkage))] #![no_std] #![unstable(feature = "stdsimd", issue = "0")] -extern crate std as _std; -extern crate coresimd; -extern crate libc; #[macro_use] extern crate cfg_if; +extern crate coresimd; +extern crate libc; +extern crate std as _std; #[cfg(test)] #[macro_use] diff --git a/crates/stdsimd/tests/cpu-detection.rs b/crates/stdsimd/tests/cpu-detection.rs index 97fa930872..8596536f17 100644 --- a/crates/stdsimd/tests/cpu-detection.rs +++ b/crates/stdsimd/tests/cpu-detection.rs @@ -59,8 +59,14 @@ fn x86_all() { println!("avx512bw {:?}", is_target_feature_detected!("avx512bw")); println!("avx512dq {:?}", is_target_feature_detected!("avx512dq")); println!("avx512vl {:?}", is_target_feature_detected!("avx512vl")); - println!("avx512_ifma {:?}", is_target_feature_detected!("avx512ifma")); - println!("avx512_vbmi {:?}", is_target_feature_detected!("avx512vbmi")); + println!( + "avx512_ifma {:?}", + is_target_feature_detected!("avx512ifma") + ); + println!( + "avx512_vbmi {:?}", + is_target_feature_detected!("avx512vbmi") + ); println!( "avx512_vpopcntdq {:?}", is_target_feature_detected!("avx512vpopcntdq") diff --git a/examples/hex.rs b/examples/hex.rs index 6a0d4c1135..fa0cc2685f 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -14,10 +14,18 @@ #![feature(cfg_target_feature, target_feature, stdsimd)] #![cfg_attr(test, feature(test))] +#![cfg_attr(feature = "cargo-clippy", + allow(result_unwrap_used, print_stdout, option_unwrap_used, + shadow_reuse, cast_possible_wrap, cast_sign_loss, + missing_docs_in_private_items))] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[macro_use] extern crate stdsimd; +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +extern crate stdsimd; + #[cfg(test)] #[macro_use] extern crate quickcheck; diff --git a/examples/nbody.rs b/examples/nbody.rs index a8e1927888..4abe850d8e 100644 --- a/examples/nbody.rs +++ b/examples/nbody.rs @@ -11,8 +11,7 @@ shadow_reuse, print_stdout))] extern crate stdsimd; -use self::stdsimd::simd; -use simd::f64x2; +use stdsimd::simd::*; const PI: f64 = std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; @@ -31,17 +30,16 @@ impl Frsqrt for f64x2 { use 
stdsimd::arch::x86::*; #[cfg(target_arch = "x86_64")] use stdsimd::arch::x86_64::*; + let t: f32x2 = (*self).into(); - let t = self.as_f32x2(); - - let u = unsafe { + let u: f64x4 = unsafe { let res = _mm_rsqrt_ps(_mm_setr_ps( t.extract(0), t.extract(1), 0., 0., )); - std::mem::transmute::<_, simd::f32x4>(res).as_f64x4() + f32x4::from_bits(res).into() }; Self::new(u.extract(0), u.extract(1)) } @@ -53,7 +51,7 @@ impl Frsqrt for f64x2 { #[cfg(target_arch = "aarch64")] use stdsimd::arch::aarch64::*; - unsafe { vrsqrte_f32(self.as_f32x2()).as_f64x2() } + unsafe { vrsqrte_f32((*self).into()).into() } } #[cfg(not(any(all(any(target_arch = "x86", target_arch = "x86_64"), @@ -84,7 +82,7 @@ impl Body { x: [x0, x1, x2], _fill: 0.0, v: [v0, v1, v2], - mass: mass, + mass, } } } @@ -133,7 +131,7 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { * (distance * distance) } dmag = f64x2::splat(dt) / dsquared * distance; - dmag.store(&mut mag, i); + dmag.store_unaligned(&mut mag[i..]); i += 2; } diff --git a/stdsimd/arch/detect/cache.rs b/stdsimd/arch/detect/cache.rs index ae9fe64f8c..1473e4fc27 100644 --- a/stdsimd/arch/detect/cache.rs +++ b/stdsimd/arch/detect/cache.rs @@ -23,6 +23,7 @@ pub const fn test_bit(x: u64, bit: u32) -> bool { const CACHE_CAPACITY: u32 = 63; /// This type is used to initialize the cache +#[derive(Copy, Clone)] pub struct Initializer(u64); impl Default for Initializer { diff --git a/stdsimd/arch/detect/linux.rs b/stdsimd/arch/detect/linux.rs index c331d8a5dc..71373667e2 100644 --- a/stdsimd/arch/detect/linux.rs +++ b/stdsimd/arch/detect/linux.rs @@ -1,4 +1,3 @@ - #![allow(dead_code)] use core::mem; @@ -43,9 +42,10 @@ pub struct AuxVec { /// /// [auxvec_h]: https://github.com/torvalds/linux/blob/master/include/uapi/linux/auxvec.h /// [auxv_docs]: https://docs.rs/auxv/0.3.3/auxv/ +#[cfg_attr(feature = "cargo-clippy", allow(items_after_statements))] pub fn auxv() -> Result { if !cfg!(target_os = "linux") { - return Err(()) + return Err(()); } if let Ok(hwcap) = getauxval(AT_HWCAP) { #[cfg(target_arch = "aarch64")] @@ -78,7 +78,10 @@ pub fn auxv() -> Result { pub type F = unsafe extern "C" fn(usize) -> usize; unsafe { - let ptr = libc::dlsym(libc::RTLD_DEFAULT, "getauxval\0".as_ptr() as *const _); + let ptr = libc::dlsym( + libc::RTLD_DEFAULT, + "getauxval\0".as_ptr() as *const _, + ); if ptr.is_null() { return Err(()); } @@ -97,7 +100,7 @@ fn auxv_from_file(file: &str) -> Result { // The auxiliary vector contains at most 32 (key,value) fields: from // `AT_EXECFN = 31` to `AT_NULL = 0`. That is, a buffer of // 2*32 `usize` elements is enough to read the whole vector. - let mut buf = [0usize; 64]; + let mut buf = [0_usize; 64]; { let raw: &mut [u8; 64 * mem::size_of::()] = unsafe { mem::transmute(&mut buf) }; @@ -128,15 +131,12 @@ fn auxv_from_buf(buf: &[usize; 64]) -> Result { _ => (), } } - if hwcap.is_some() && hwcap2.is_some() { - return Ok(AuxVec { - hwcap: hwcap.unwrap(), - hwcap2: hwcap2.unwrap(), - }); + + if let (Some(hwcap), Some(hwcap2)) = (hwcap, hwcap2) { + return Ok(AuxVec { hwcap, hwcap2 }); } } - drop(buf); Err(()) } @@ -147,9 +147,9 @@ pub struct CpuInfo { impl CpuInfo { /// Reads /proc/cpuinfo into CpuInfo. 
- pub fn new() -> Result { + pub fn new() -> Result { let mut file = File::open("/proc/cpuinfo")?; - let mut cpui = CpuInfo { raw: String::new() }; + let mut cpui = Self { raw: String::new() }; file.read_to_string(&mut cpui.raw)?; Ok(cpui) } @@ -157,7 +157,7 @@ impl CpuInfo { pub fn field(&self, field: &str) -> CpuInfoField { for l in self.raw.lines() { if l.trim().starts_with(field) { - return CpuInfoField::new(l.split(": ").skip(1).next()); + return CpuInfoField::new(l.split(": ").nth(1)); } } CpuInfoField(None) @@ -170,8 +170,8 @@ impl CpuInfo { } #[cfg(test)] - fn from_str(other: &str) -> Result { - Ok(CpuInfo { + fn from_str(other: &str) -> Result { + Ok(Self { raw: String::from(other), }) } @@ -184,7 +184,7 @@ pub struct CpuInfoField<'a>(Option<&'a str>); impl<'a> PartialEq<&'a str> for CpuInfoField<'a> { fn eq(&self, other: &&'a str) -> bool { match self.0 { - None => other.len() == 0, + None => other.is_empty(), Some(f) => f == other.trim(), } } @@ -205,10 +205,10 @@ impl<'a> CpuInfoField<'a> { /// Does the field contain `other`? pub fn has(&self, other: &str) -> bool { match self.0 { - None => other.len() == 0, + None => other.is_empty(), Some(f) => { let other = other.trim(); - for v in f.split(" ") { + for v in f.split(' ') { if v == other { return true; } @@ -259,10 +259,10 @@ mod tests { #[test] fn auxv_crate() { - if cfg!(target_arch = "x86") || - cfg!(target_arch = "x86_64") || - cfg!(target_arch = "powerpc") { - return + if cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64") + || cfg!(target_arch = "powerpc") + { + return; } let v = auxv(); if let Some(hwcap) = auxv_crate_getauxval(AT_HWCAP) { @@ -280,8 +280,9 @@ mod tests { #[cfg(target_arch = "arm")] #[test] fn linux_rpi3() { - let v = auxv_from_file("../../stdsimd/arch/detect/test_data/linux-rpi3.auxv") - .unwrap(); + let v = auxv_from_file( + "../../stdsimd/arch/detect/test_data/linux-rpi3.auxv", + ).unwrap(); assert_eq!(v.hwcap, 4174038); assert_eq!(v.hwcap2, 16); } @@ -320,7 +321,7 @@ mod tests { #[test] fn auxv_crate_procfs() { if cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64") { - return + return; } let v = auxv(); if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) { diff --git a/stdsimd/arch/detect/x86.rs b/stdsimd/arch/detect/x86.rs index a49a6c405b..7705cb9934 100644 --- a/stdsimd/arch/detect/x86.rs +++ b/stdsimd/arch/detect/x86.rs @@ -273,6 +273,7 @@ pub enum Feature { /// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID /// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +#[cfg_attr(feature = "cargo-clippy", allow(similar_names))] pub fn detect_features() -> cache::Initializer { let mut value = cache::Initializer::default(); diff --git a/stdsimd/mod.rs b/stdsimd/mod.rs index 11d1d02489..9b73a3dbd4 100644 --- a/stdsimd/mod.rs +++ b/stdsimd/mod.rs @@ -1,10 +1,12 @@ +//! `stdsimd` + /// SIMD and vendor intrinsics module. /// -/// This module is intended to be the gateway to architecture-specific intrinsic -/// functions, typically related to SIMD (but not always!). Each architecture -/// that Rust compiles to may contain a submodule here, which means that this is -/// not a portable module! If you're writing a portable library take care when -/// using these APIs! +/// This module is intended to be the gateway to architecture-specific +/// intrinsic functions, typically related to SIMD (but not always!). 
Each +/// architecture that Rust compiles to may contain a submodule here, which +/// means that this is not a portable module! If you're writing a portable +/// library take care when using these APIs! /// /// Under this module you'll find an architecture-named module, such as /// `x86_64`. Each `#[cfg(target_arch)]` that Rust can compile to may have a @@ -42,13 +44,13 @@ /// /// # CPU Feature Detection /// -/// In order to call these APIs in a safe fashion there's a number of mechanisms -/// available to ensure that the correct CPU feature is available to call an -/// intrinsic. Let's consider, for example, the `_mm256_add_epi64` intrinsics on -/// the `x86` and `x86_64` architectures. This function requires the AVX2 -/// feature as [documented by Intel][intel-dox] so to correctly call this -/// function we need to (a) guarantee we only call it on x86/x86_64 and (b) -/// ensure that the CPU feature is available +/// In order to call these APIs in a safe fashion there's a number of +/// mechanisms available to ensure that the correct CPU feature is available +/// to call an intrinsic. Let's consider, for example, the `_mm256_add_epi64` +/// intrinsics on the `x86` and `x86_64` architectures. This function requires +/// the AVX2 feature as [documented by Intel][intel-dox] so to correctly call +/// this function we need to (a) guarantee we only call it on `x86`/`x86_64` and +/// (b) ensure that the CPU feature is available /// /// [intel-dox]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64&expand=100 /// @@ -80,9 +82,9 @@ /// `#[cfg]` to only compile the code in situations where the safety guarantees /// are upheld. /// -/// Statically enabling a feature is typically done with the `-C target-feature` -/// or `-C target-cpu` flags to the compiler. For example if your local CPU -/// supports AVX2 then you can compile the above function with: +/// Statically enabling a feature is typically done with the `-C +/// target-feature` or `-C target-cpu` flags to the compiler. For example if +/// your local CPU supports AVX2 then you can compile the above function with: /// /// ```sh /// $ RUSTFLAGS='-C target-cpu=native' cargo build @@ -107,8 +109,8 @@ /// sections more optimized for different CPUs. /// /// Taking our previous example from before, we're going to compile our binary -/// *without* AVX2 support, but we'd like to enable it for just one function. We -/// can do that in a manner like: +/// *without* AVX2 support, but we'd like to enable it for just one function. +/// We can do that in a manner like: /// /// ```ignore /// fn foo() { @@ -141,14 +143,15 @@ /// the standard library, this macro will perform necessary runtime detection /// to determine whether the CPU the program is running on supports the /// specified feature. In this case the macro will expand to a boolean -/// expression evaluating to whether the local CPU has the AVX2 feature or not. +/// expression evaluating to whether the local CPU has the AVX2 feature or +/// not. /// /// Note that this macro, like the `arch` module, is platform-specific. The /// name of the macro is the same across platforms, but the arguments to the /// macro are only the features for the current platform. For example calling /// `is_target_feature_detected!("avx2")` on ARM will be a compile time /// error. To ensure we don't hit this error a statement level `#[cfg]` is -/// used to only compile usage of the macro on x86/x86_64. +/// used to only compile usage of the macro on `x86`/`x86_64`. 
/// /// * Next up we see our AVX2-enabled function, `foo_avx2`. This function is /// decorated with the `#[target_feature]` attribute which enables a CPU @@ -166,9 +169,9 @@ /// /// # Ergonomics /// -/// It's important to note that using the `arch` module is not the easiest thing -/// in the world, so if you're curious to try it out you may want to brace -/// yourself for some wordiness! +/// It's important to note that using the `arch` module is not the easiest +/// thing in the world, so if you're curious to try it out you may want to +/// brace yourself for some wordiness! /// /// The primary purpose of this module is to enable stable crates on crates.io /// to build up much more ergonomic abstractions which end up using SIMD under @@ -181,15 +184,15 @@ /// This documentation is only for one particular architecture, you can find /// others at: /// -/// * [x86] -/// * [x86_64] -/// * [arm] -/// * [aarch64] +/// * [`x86`] +/// * [`x86_64`] +/// * [`arm`] +/// * [`aarch64`] /// -/// [x86]: https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/arch/x86/index.html -/// [x86_64]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/x86_64/index.html -/// [arm]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/arm/index.html -/// [aarch64]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/aarch64/index.html +/// [`x86`]: https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/arch/x86/index.html +/// [`x86_64`]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/x86_64/index.html +/// [`arm`]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/arm/index.html +/// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/aarch64/index.html /// /// # Examples /// @@ -336,7 +339,6 @@ /// } /// } /// ``` - #[unstable(feature = "stdsimd", issue = "0")] pub mod arch { #[cfg(target_arch = "x86")]
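The reflowed `stdsimd/mod.rs` documentation above describes combining compile-time gating (`#[cfg(target_arch)]`, `#[target_feature]`) with run-time detection via `is_target_feature_detected!`. Below is a minimal sketch of that dispatch pattern as a free-standing crate; it reuses the nightly feature names and macro usage that appear elsewhere in this patch (`cfg_target_feature`, `target_feature`, `stdsimd`, `#[macro_use] extern crate stdsimd`), while the `add_quickly*` helpers are hypothetical and are not part of this diff.

```rust
// Sketch: static + dynamic CPU feature detection as described in the module
// docs. `add_quickly*` are illustrative names only.
#![feature(cfg_target_feature, target_feature, stdsimd)]

#[macro_use]
extern crate stdsimd;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
    // Run-time check: only take the AVX2 path when the CPU supports it.
    if is_target_feature_detected!("avx2") {
        unsafe { add_quickly_avx2(a, b, c) }
    } else {
        add_quickly_fallback(a, b, c)
    }
}

// `#[target_feature]` enables AVX2 code generation for this one function even
// though the rest of the crate is compiled without it; it is only sound to
// call after the run-time check above, hence the `unsafe fn`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
    // The loop below may now be auto-vectorized with AVX2 instructions.
    add_quickly_fallback(a, b, c)
}

fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
    for ((a, b), c) in a.iter().zip(b).zip(c) {
        *c = *a + *b;
    }
}

fn main() {
    let (a, b) = ([1_u8; 32], [2_u8; 32]);
    let mut c = [0_u8; 32];
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    add_quickly(&a, &b, &mut c);
}
```

As the docs note, the macro and the `arch` module are platform-specific, so the statement-level `#[cfg]` keeps the example compiling on non-`x86` targets where `is_target_feature_detected!("avx2")` would be a compile-time error.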