diff --git a/TODO.md b/TODO.md index 1373340a72..48d81f7b2d 100644 --- a/TODO.md +++ b/TODO.md @@ -811,8 +811,8 @@ avx2 * [ ] `_mm_i64gather_epi64` * [ ] `_mm256_i64gather_epi64` * [ ] `_mm256_inserti128_si256` -* [ ] `_mm256_madd_epi16` -* [ ] `_mm256_maddubs_epi16` +* [x] `_mm256_madd_epi16` +* [x] `_mm256_maddubs_epi16` * [ ] `_mm_mask_i32gather_pd` * [ ] `_mm256_mask_i32gather_pd` * [ ] `_mm_mask_i32gather_ps` @@ -837,45 +837,45 @@ avx2 * [ ] `_mm256_maskstore_epi32` * [ ] `_mm_maskstore_epi64` * [ ] `_mm256_maskstore_epi64` -* [ ] `_mm256_max_epi8` -* [ ] `_mm256_max_epi16` -* [ ] `_mm256_max_epi32` -* [ ] `_mm256_max_epu8` -* [ ] `_mm256_max_epu16` -* [ ] `_mm256_max_epu32` -* [ ] `_mm256_min_epi8` -* [ ] `_mm256_min_epi16` -* [ ] `_mm256_min_epi32` -* [ ] `_mm256_min_epu8` -* [ ] `_mm256_min_epu16` -* [ ] `_mm256_min_epu32` +* [x] `_mm256_max_epi8` +* [x] `_mm256_max_epi16` +* [x] `_mm256_max_epi32` +* [x] `_mm256_max_epu8` +* [x] `_mm256_max_epu16` +* [x] `_mm256_max_epu32` +* [x] `_mm256_min_epi8` +* [x] `_mm256_min_epi16` +* [x] `_mm256_min_epi32` +* [x] `_mm256_min_epu8` +* [x] `_mm256_min_epu16` +* [x] `_mm256_min_epu32` * [ ] `_mm256_movemask_epi8` * [ ] `_mm256_mpsadbw_epu8` -* [ ] `_mm256_mul_epi32` -* [ ] `_mm256_mul_epu32` -* [ ] `_mm256_mulhi_epi16` -* [ ] `_mm256_mulhi_epu16` -* [ ] `_mm256_mulhrs_epi16` -* [ ] `_mm256_mullo_epi16` -* [ ] `_mm256_mullo_epi32` -* [ ] `_mm256_or_si256` -* [ ] `_mm256_packs_epi16` -* [ ] `_mm256_packs_epi32` -* [ ] `_mm256_packus_epi16` -* [ ] `_mm256_packus_epi32` +* [x] `_mm256_mul_epi32` +* [x] `_mm256_mul_epu32` +* [x] `_mm256_mulhi_epi16` +* [x] `_mm256_mulhi_epu16` +* [x] `_mm256_mulhrs_epi16` +* [x] `_mm256_mullo_epi16` +* [x] `_mm256_mullo_epi32` +* [x] `_mm256_or_si256` +* [x] `_mm256_packs_epi16` +* [x] `_mm256_packs_epi32` +* [x] `_mm256_packus_epi16` +* [x] `_mm256_packus_epi32` * [ ] `_mm256_permute2x128_si256` * [ ] `_mm256_permute4x64_epi64` * [ ] `_mm256_permute4x64_pd` * [ ] `_mm256_permutevar8x32_epi32` * [ ] `_mm256_permutevar8x32_ps` -* [ ] `_mm256_sad_epu8` +* [x] `_mm256_sad_epu8` * [ ] `_mm256_shuffle_epi32` * [ ] `_mm256_shuffle_epi8` * [ ] `_mm256_shufflehi_epi16` * [ ] `_mm256_shufflelo_epi16` -* [ ] `_mm256_sign_epi8` -* [ ] `_mm256_sign_epi16` -* [ ] `_mm256_sign_epi32` +* [x] `_mm256_sign_epi8` +* [x] `_mm256_sign_epi16` +* [x] `_mm256_sign_epi32` * [ ] `_mm256_slli_si256` * [ ] `_mm256_bslli_epi128` * [ ] `_mm256_sll_epi16` diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 2840b0144f..a92f055276 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -78,6 +78,8 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { paddusw(a, b) } } +// TODO _mm256_alignr_epi8 + /// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. 
#[inline(always)] @@ -108,7 +110,9 @@ pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { unsafe { pavgb(a, b) } } -// TODO _mm256_alignr_epi8 + + + // TODO _mm256_blend_epi16 // TODO _mm_blend_epi32 // TODO _mm256_blend_epi32 @@ -252,6 +256,338 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { unsafe { phsubsw(a, b) } } + +// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) +// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) +// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) +// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) +// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale) +// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) +// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) +// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale) +// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) +// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale) +// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) +// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) +// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) +// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale) +// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) +// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale) +// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) +// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i64gather_ps 
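+
+// A scalar sketch of the gather semantics listed above (illustrative only;
+// the helper below is hypothetical and not part of the API). Each lane of
+// `vindex` selects an element at `base_addr + index * scale`, where `scale`
+// is a byte scale of 1, 2, 4 or 8. Assuming non-negative indices and a
+// scale that is a multiple of 4, `_mm256_i32gather_epi32` behaves like:
+//
+//     fn i32gather_epi32_scalar(base: &[i32], vindex: [i32; 8], scale: usize) -> [i32; 8] {
+//         let mut dst = [0i32; 8];
+//         for (d, &ix) in dst.iter_mut().zip(vindex.iter()) {
+//             // byte offset `ix * scale`, converted to an element index
+//             *d = base[ix as usize * scale / 4];
+//         }
+//         dst
+//     }
+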
+// TODO _mm256_inserti128_si256 + +/// Multiply packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs +/// of intermediate 32-bit integers. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 { + unsafe { pmaddwd(a, b) } +} + +/// Vertically multiply each unsigned 8-bit integer from `a` with the +/// corresponding signed 8-bit integer from `b`, producing intermediate +/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate +/// signed 16-bit integers +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 { + unsafe { pmaddubsw(a, b) } +} + +// TODO _mm_maskload_epi32 (int const* mem_addr, __m128i mask) +// TODO _mm256_maskload_epi32 (int const* mem_addr, __m256i mask) +// TODO _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask) +// TODO _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask) +// TODO _mm_maskstore_epi32 (int* mem_addr, __m128i mask, __m128i a) +// TODO _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) +// TODO _mm_maskstore_epi64 (__int64* mem_addr, __m128i mask, __m128i a) +// TODO _mm256_maskstore_epi64 (__int64* mem_addr, __m256i mask, __m256i a) + +/// Compare packed 16-bit integers in `a` and `b`, and return the packed +/// maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { pmaxsw(a, b) } +} + +/// Compare packed 32-bit integers in `a` and `b`, and return the packed +/// maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { pmaxsd(a, b) } +} + +/// Compare packed 8-bit integers in `a` and `b`, and return the packed +/// maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { + unsafe { pmaxsb(a, b) } +} + +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return +/// the packed maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 { + unsafe { pmaxuw(a, b) } +} + +/// Compare packed unsigned 32-bit integers in `a` and `b`, and return +/// the packed maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 { + unsafe { pmaxud(a, b) } +} + +/// Compare packed unsigned 8-bit integers in `a` and `b`, and return +/// the packed maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 { + unsafe { pmaxub(a, b) } +} + +/// Compare packed 16-bit integers in `a` and `b`, and return the packed +/// minimum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { pminsw(a, b) } +} + +/// Compare packed 32-bit integers in `a` and `b`, and return the packed +/// minimum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { pminsd(a, b) } +} + +/// Compare packed 8-bit integers in `a` and `b`, and return the packed +/// minimum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 { + unsafe { pminsb(a, b) } +} + +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return +/// the packed minimum values. 
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
+    unsafe { pminuw(a, b) }
+}
+
+/// Compare packed unsigned 32-bit integers in `a` and `b`, and return
+/// the packed minimum values.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
+    unsafe { pminud(a, b) }
+}
+
+/// Compare packed unsigned 8-bit integers in `a` and `b`, and return
+/// the packed minimum values.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 {
+    unsafe { pminub(a, b) }
+}
+
+/*** The following two functions fail in debug builds, but work in release
+
+/// Create a mask from the most significant bit of each 8-bit element in `a`,
+/// and return the result.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_movemask_epi8(a: i8x32) -> i32 {
+    unsafe { pmovmskb(a) }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+/// 8-bit integers in `a` compared to those in `b`, and return the 16-bit
+/// results. Eight SADs are performed for each 128-bit lane using one
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight
+/// quadruplets are formed from sequential 8-bit integers selected from `a`
+/// starting at the offset specified in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
+    unsafe { mpsadbw(a, b, imm8) }
+}
+
+***/
+
+/// Multiply the low 32-bit integers from each packed 64-bit element in
+/// `a` and `b`.
+///
+/// Return the 64-bit results.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
+    unsafe { pmuldq(a, b) }
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit
+/// element in `a` and `b`.
+///
+/// Return the unsigned 64-bit results.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
+    unsafe { pmuludq(a, b) }
+}
+
+/// Multiply the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers and returning the high 16 bits of the
+/// intermediate integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    unsafe { pmulhw(a, b) }
+}
+
+/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers and returning the high 16 bits of the
+/// intermediate integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
+    unsafe { pmulhuw(a, b) }
+}
+
+/// Multiply the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers, and return the low 16 bits of the
+/// intermediate integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mullo_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    a * b
+}
+
+
+/// Multiply the packed 32-bit integers in `a` and `b`, producing
+/// intermediate 64-bit integers, and return the low 32 bits of the
+/// intermediate integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mullo_epi32(a: i32x8, b: i32x8) -> i32x8 {
+    a * b
+}
+
+/// Multiply packed 16-bit integers in `a` and `b`, producing
+/// intermediate signed 32-bit integers. Truncate each intermediate
+/// integer to the 18 most significant bits, round by adding 1, and
+/// return bits [16:1].
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mulhrs_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    unsafe { pmulhrsw(a, b) }
+}
+
+/// Compute the bitwise OR of 256 bits (representing integer data) in `a`
+/// and `b`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
+    a | b
+}
+
+/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
+    unsafe { packsswb(a, b) }
+}
+
+/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
+    unsafe { packssdw(a, b) }
+}
+
+/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using unsigned saturation.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
+    unsafe { packuswb(a, b) }
+}
+
+/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using unsigned saturation.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
+    unsafe { packusdw(a, b) }
+}
+
+// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
+// TODO _mm256_permute4x64_epi64 (__m256i a, const int imm8)
+// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
+// TODO _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx)
+// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
+
+/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
+/// and `b`, then horizontally sum each consecutive 8 differences to
+/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
+/// integers in the low 16 bits of the four 64-bit elements of the return
+/// value.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_sad_epu8(a: u8x32, b: u8x32) -> u64x4 {
+    unsafe { psadbw(a, b) }
+}
+
+// TODO _mm256_shuffle_epi32 (__m256i a, const int imm8)
+// TODO _mm256_shuffle_epi8 (__m256i a, __m256i b)
+// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
+// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
+
+/// Negate packed 16-bit integers in `a` when the corresponding signed
+/// 16-bit integer in `b` is negative, and return the result.
+/// Elements of the result are zeroed out when the corresponding element
+/// in `b` is zero.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    unsafe { psignw(a, b) }
+}
+
+/// Negate packed 32-bit integers in `a` when the corresponding signed
+/// 32-bit integer in `b` is negative, and return the result.
+/// Elements of the result are zeroed out when the corresponding element
+/// in `b` is zero.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
+    unsafe { psignd(a, b) }
+}
+
+/// Negate packed 8-bit integers in `a` when the corresponding signed
+/// 8-bit integer in `b` is negative, and return the result.
+/// Elements of the result are zeroed out when the corresponding element
+/// in `b` is zero.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
+    unsafe { psignb(a, b) }
+}
+
+
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx2.pabs.b"]
     fn pabsb(a: i8x32) -> u8x32;
@@ -267,7 +603,7 @@ extern "C" {
     #[link_name = "llvm.x86.avx2.paddus.b"]
     fn paddusb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.paddus.w"]
-    fn paddusw(a: u16x16, b: u16x16) -> u16x16;
+    fn paddusw(a: u16x16, b: u16x16) -> u16x16;
     #[link_name = "llvm.x86.avx2.pavg.b"]
     fn pavgb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.pavg.w"]
@@ -286,6 +622,65 @@ extern "C" {
     fn phsubd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phsub.sw"]
     fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.pmadd.wd"]
+    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
+    #[link_name =
"llvm.x86.avx2.pmadd.ub.sw"] + fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16; + #[link_name = "llvm.x86.avx2.pmaxs.w"] + fn pmaxsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pmaxs.d"] + fn pmaxsd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.pmaxs.b"] + fn pmaxsb(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.pmaxu.w"] + fn pmaxuw(a: u16x16, b: u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pmaxu.d"] + fn pmaxud(a: u32x8, b: u32x8) -> u32x8; + #[link_name = "llvm.x86.avx2.pmaxu.b"] + fn pmaxub(a: u8x32, b: u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.pmins.w"] + fn pminsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pmins.d"] + fn pminsd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.pmins.b"] + fn pminsb(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.pminu.w"] + fn pminuw(a: u16x16, b: u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pminu.d"] + fn pminud(a: u32x8, b: u32x8) -> u32x8; + #[link_name = "llvm.x86.avx2.pminu.b"] + fn pminub(a: u8x32, b: u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.pmovmskb"] //fails in debug + fn pmovmskb(a: i8x32) -> i32; + #[link_name = "llvm.x86.avx2.mpsadbw"] //fails in debug + fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; + #[link_name = "llvm.x86.avx2.pmulhu.w"] + fn pmulhuw(a: u16x16, b: u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pmulh.w"] + fn pmulhw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pmul.dq"] + fn pmuldq(a: i32x8, b:i32x8) -> i64x4; + #[link_name = "llvm.x86.avx2.pmulu.dq"] + fn pmuludq(a: u32x8, b:u32x8) -> u64x4; + #[link_name = "llvm.x86.avx2.pmul.hr.sw"] + fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.packsswb"] + fn packsswb(a: i16x16, b: i16x16) -> i8x32; + #[link_name = "llvm.x86.avx2.packssdw"] + fn packssdw(a: i32x8, b: i32x8) -> i16x16; + #[link_name = "llvm.x86.avx2.packuswb"] + fn packuswb(a: i16x16, b: i16x16) -> u8x32; + #[link_name = "llvm.x86.avx2.packusdw"] + fn packusdw(a: i32x8, b: i32x8) -> u16x16; + #[link_name = "llvm.x86.avx2.psad.bw"] + fn psadbw(a: u8x32, b: u8x32) -> u64x4; + #[link_name = "llvm.x86.avx2.psign.b"] + fn psignb(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.psign.w"] + fn psignw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.psign.d"] + fn psignd(a: i32x8, b: i32x8) -> i32x8; + } @@ -695,4 +1090,335 @@ mod tests { assert_eq!(r, e); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_madd_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_madd_epi16(a, b); + let e = i32x8::splat(16); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_maddubs_epi16() { + let a = u8x32::splat(2); + let b = u8x32::splat(4); + let r = avx2::_mm256_maddubs_epi16(a, b); + let e = i16x16::splat(16); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_max_epi16(a, b); + assert_eq!(r, b); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_max_epi32(a, b); + assert_eq!(r, b); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epi8() { + let a = i8x32::splat(2); + let b = i8x32::splat(4); + let r = avx2::_mm256_max_epi8(a, b); + assert_eq!(r, b); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epu16() { + let a = u16x16::splat(2); 
+        let b = u16x16::splat(4);
+        let r = avx2::_mm256_max_epu16(a, b);
+        assert_eq!(r, b);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_max_epu32() {
+        let a = u32x8::splat(2);
+        let b = u32x8::splat(4);
+        let r = avx2::_mm256_max_epu32(a, b);
+        assert_eq!(r, b);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_max_epu8() {
+        let a = u8x32::splat(2);
+        let b = u8x32::splat(4);
+        let r = avx2::_mm256_max_epu8(a, b);
+        assert_eq!(r, b);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_min_epi16() {
+        let a = i16x16::splat(2);
+        let b = i16x16::splat(4);
+        let r = avx2::_mm256_min_epi16(a, b);
+        assert_eq!(r, a);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_min_epi32() {
+        let a = i32x8::splat(2);
+        let b = i32x8::splat(4);
+        let r = avx2::_mm256_min_epi32(a, b);
+        assert_eq!(r, a);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_min_epi8() {
+        let a = i8x32::splat(2);
+        let b = i8x32::splat(4);
+        let r = avx2::_mm256_min_epi8(a, b);
+        assert_eq!(r, a);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_min_epu16() {
+        let a = u16x16::splat(2);
+        let b = u16x16::splat(4);
+        let r = avx2::_mm256_min_epu16(a, b);
+        assert_eq!(r, a);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_min_epu32() {
+        let a = u32x8::splat(2);
+        let b = u32x8::splat(4);
+        let r = avx2::_mm256_min_epu32(a, b);
+        assert_eq!(r, a);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_min_epu8() {
+        let a = u8x32::splat(2);
+        let b = u8x32::splat(4);
+        let r = avx2::_mm256_min_epu8(a, b);
+        assert_eq!(r, a);
+    }
+
+
+/**
+    // TODO this fails in debug but not release, why?
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_movemask_epi8() {
+        let a = i8x32::splat(-1);
+        let r = avx2::_mm256_movemask_epi8(a);
+        let e: i32 = -1;
+        assert_eq!(r, e);
+    }
+
+    // TODO This fails in debug but not in release, why?
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mpsadbw_epu8() {
+        let a = u8x32::splat(2);
+        let b = u8x32::splat(4);
+        let r = avx2::_mm256_mpsadbw_epu8(a, b, 0);
+        let e = u16x16::splat(8);
+        assert_eq!(r, e);
+    }
+**/
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mul_epi32() {
+        let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
+        let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r = avx2::_mm256_mul_epi32(a, b);
+        let e = i64x4::new(0, 0, 10, 14);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mul_epu32() {
+        let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
+        let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r = avx2::_mm256_mul_epu32(a, b);
+        let e = u64x4::new(0, 0, 10, 14);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mulhi_epi16() {
+        let a = i16x16::splat(6535);
+        let b = i16x16::splat(6535);
+        let r = avx2::_mm256_mulhi_epi16(a, b);
+        let e = i16x16::splat(651);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mulhi_epu16() {
+        let a = u16x16::splat(6535);
+        let b = u16x16::splat(6535);
+        let r = avx2::_mm256_mulhi_epu16(a, b);
+        let e = u16x16::splat(651);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mullo_epi16() {
+        let a = i16x16::splat(2);
+        let b = i16x16::splat(4);
+        let r = avx2::_mm256_mullo_epi16(a, b);
+        let e = i16x16::splat(8);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mullo_epi32() {
+        let a = i32x8::splat(2);
+        let b = i32x8::splat(4);
+        let r = avx2::_mm256_mullo_epi32(a, b);
+        let e = i32x8::splat(8);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mulhrs_epi16() {
+        let a = i16x16::splat(16384);
+        let b = i16x16::splat(8192);
+        let r = avx2::_mm256_mulhrs_epi16(a, b);
+        // (16384 * 8192) scaled down by 2^15 with rounding is 4096.
+        let e = i16x16::splat(4096);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_or_si256() {
+        let a = __m256i::splat(-1);
+        let b = __m256i::splat(0);
+        let r = avx2::_mm256_or_si256(a, b);
+        assert_eq!(r, a);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_packs_epi16() {
+        let a = i16x16::splat(2);
+        let b = i16x16::splat(4);
+        let r = avx2::_mm256_packs_epi16(a, b);
+        let e = i8x32::new(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4);
+
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_packs_epi32() {
+        let a = i32x8::splat(2);
+        let b = i32x8::splat(4);
+        let r = avx2::_mm256_packs_epi32(a, b);
+        let e = i16x16::new(
+            2, 2, 2, 2,
+            4, 4, 4, 4,
+            2, 2, 2, 2,
+            4, 4, 4, 4);
+
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_packus_epi16() {
+        let a = i16x16::splat(2);
+        let b = i16x16::splat(4);
+        let r = avx2::_mm256_packus_epi16(a, b);
+        let e = u8x32::new(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4);
+
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_packus_epi32() {
+        let a = i32x8::splat(2);
+        let b = i32x8::splat(4);
+        let r = avx2::_mm256_packus_epi32(a, b);
+        let e = u16x16::new(
+            2, 2, 2, 2,
+            4, 4, 4, 4,
+            2, 2, 2, 2,
+            4, 4, 4, 4);
+
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_sad_epu8() {
+        let a = u8x32::splat(2);
+        let b = u8x32::splat(4);
+        let r = avx2::_mm256_sad_epu8(a, b);
+        let e = u64x4::splat(16);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_sign_epi16() {
+        let a = i16x16::splat(2);
+        let b = i16x16::splat(-1);
+        let r = avx2::_mm256_sign_epi16(a, b);
+        let e = i16x16::splat(-2);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_sign_epi32() {
+        let a = i32x8::splat(2);
+        let b = i32x8::splat(-1);
+        let r = avx2::_mm256_sign_epi32(a, b);
+        let e = i32x8::splat(-2);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_sign_epi8() {
+        let a = i8x32::splat(2);
+        let b = i8x32::splat(-1);
+        let r = avx2::_mm256_sign_epi8(a, b);
+        let e = i8x32::splat(-2);
+        assert_eq!(r, e);
+    }
+}
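+
+// Scalar reference models for two of the less obvious semantics above.
+// These helpers are illustrative only (hypothetical names, not intrinsics)
+// and simply restate the per-element behaviour described in the doc
+// comments, which can be handy when checking expected test values by hand.
+#[allow(dead_code)]
+fn mulhrs_epi16_scalar(a: i16, b: i16) -> i16 {
+    // 32-bit product, keep the 18 most significant bits, round by adding 1,
+    // then take bits [16:1]; i.e. the product scaled down by 2^15 with
+    // rounding.
+    let t = ((a as i32) * (b as i32) >> 14) + 1;
+    (t >> 1) as i16
+}
+
+#[allow(dead_code)]
+fn sad_epu8_scalar(a: &[u8; 8], b: &[u8; 8]) -> u64 {
+    // Sum of absolute differences over one group of eight bytes; the
+    // intrinsic produces one such sum per 64-bit element of the result.
+    a.iter()
+        .zip(b.iter())
+        .map(|(&x, &y)| (x as i32 - y as i32).abs() as u64)
+        .sum()
+}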