From aa5a55e4966b83ba80896623515bdefec987328f Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 28 Apr 2025 12:02:27 +0530 Subject: [PATCH 1/3] Fix CI errors due to alignment issues in msvc --- crates/core_arch/src/x86/avx.rs | 30 ++++++++-- crates/core_arch/src/x86/avx512f.rs | 90 +++++++++++++++++++++++------ crates/core_arch/src/x86/sse.rs | 32 ++++++++-- crates/core_arch/src/x86/sse2.rs | 25 ++++++-- 4 files changed, 142 insertions(+), 35 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index f97bab4994..9a8c08b01b 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1425,7 +1425,10 @@ pub fn _mm256_insert_epi32(a: __m256i, i: i32) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovap))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovap) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { @@ -1440,7 +1443,10 @@ pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovap))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovap) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) { @@ -1455,7 +1461,10 @@ pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { @@ -1470,7 +1479,10 @@ pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) { @@ -1548,7 +1560,10 @@ pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] // FIXME vmovdqa expected #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { *mem_addr @@ -1561,7 +1576,10 @@ pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { /// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] // FIXME vmovdqa expected #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { *mem_addr = a; diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a81b64c383..74a2c5ed68 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -34468,7 +34468,10 @@ pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm512_load_si512(mem_addr: *const __m512i) -> __m512i { ptr::read(mem_addr) } @@ -34479,7 +34482,10 @@ pub unsafe fn _mm512_load_si512(mem_addr: *const __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm512_store_si512(mem_addr: *mut __m512i, a: __m512i) { ptr::write(mem_addr, a); } @@ -34490,7 +34496,10 @@ pub unsafe fn _mm512_store_si512(mem_addr: *mut __m512i, a: __m512i) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i { ptr::read(mem_addr as *const __m512i) } @@ -34501,7 +34510,10 @@ pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i { ptr::read(mem_addr as *const __m256i) } @@ -34512,7 +34524,10 @@ pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i { ptr::read(mem_addr as *const __m128i) } @@ -34523,7 +34538,10 @@ pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + 
assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) { ptr::write(mem_addr as *mut __m512i, a); } @@ -34534,7 +34552,10 @@ pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) { ptr::write(mem_addr as *mut __m256i, a); } @@ -34545,7 +34566,10 @@ pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) { ptr::write(mem_addr as *mut __m128i, a); } @@ -34556,7 +34580,10 @@ pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i { ptr::read(mem_addr as *const __m512i) } @@ -34567,7 +34594,10 @@ pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i { ptr::read(mem_addr as *const __m256i) } @@ -34578,7 +34608,10 @@ pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i { ptr::read(mem_addr as *const __m128i) } @@ -34589,7 +34622,10 @@ pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) { ptr::write(mem_addr as *mut __m512i, a); } @@ -34600,7 +34636,10 @@ pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + 
assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) { ptr::write(mem_addr as *mut __m256i, a); } @@ -34611,7 +34650,10 @@ pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) { ptr::write(mem_addr as *mut __m128i, a); } @@ -34622,7 +34664,10 @@ pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 { ptr::read(mem_addr as *const __m512) } @@ -34633,7 +34678,10 @@ pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) { ptr::write(mem_addr as *mut __m512, a); } @@ -34644,7 +34692,10 @@ pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovapd pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d { ptr::read(mem_addr as *const __m512d) } @@ -34655,7 +34706,10 @@ pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovapd pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { ptr::write(mem_addr as *mut __m512d, a); } diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index c31d6541a9..a845e822fc 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1158,7 +1158,12 @@ pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +// FIXME: Rust doesn't emit alignment attributes for MSVC x86-32. 
Ref https://github.com/rust-lang/rust/pull/139261 +// All aligned load/store intrinsics are affected +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { @@ -1213,7 +1218,10 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { let a = _mm_load_ps(p); @@ -1253,7 +1261,10 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { @@ -1266,7 +1277,10 @@ pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { _mm_store1_ps(p, a); @@ -1285,7 +1299,10 @@ pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { @@ -1329,7 +1346,10 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index aeba43e1da..3dabcde18c 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -1243,7 +1243,10 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + 
assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { *mem_addr @@ -1293,7 +1296,10 @@ pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { *mem_addr = a; @@ -2535,7 +2541,10 @@ pub fn _mm_movemask_pd(a: __m128d) -> i32 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { @@ -2628,7 +2637,10 @@ pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { @@ -2783,7 +2795,10 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { let a = _mm_load_pd(mem_addr); From 88680a369da551868059294aff62f0503b9a8ccd Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 28 Apr 2025 11:31:22 +0530 Subject: [PATCH 2/3] Fix errors in decoupling avx512vl and avx512dq from avx512fp16 --- crates/core_arch/src/x86/avx512fp16.rs | 162 ++++++++++++------------- crates/core_arch/src/x86/mod.rs | 36 ++++-- 2 files changed, 106 insertions(+), 92 deletions(-) diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs index d0a4f41dac..b6d3d75ed0 100644 --- a/crates/core_arch/src/x86/avx512fp16.rs +++ b/crates/core_arch/src/x86/avx512fp16.rs @@ -249,7 +249,7 @@ pub fn _mm_setzero_ph() -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm256_setzero_ph() -> __m256h { - unsafe { transmute(f16x16::ZERO) } + f16x16::ZERO.as_m256h() } /// Return vector of type __m512h with all elements set to zero. 
@@ -259,7 +259,7 @@ pub fn _mm256_setzero_ph() -> __m256h { #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_setzero_ph() -> __m512h { - unsafe { transmute(f16x32::ZERO) } + f16x32::ZERO.as_m512h() } /// Return vector of type `__m128h` with indetermination elements. @@ -272,7 +272,7 @@ pub fn _mm512_setzero_ph() -> __m512h { #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_undefined_ph() -> __m128h { - unsafe { transmute(f16x8::ZERO) } + f16x8::ZERO.as_m128h() } /// Return vector of type `__m256h` with indetermination elements. @@ -285,7 +285,7 @@ pub fn _mm_undefined_ph() -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm256_undefined_ph() -> __m256h { - unsafe { transmute(f16x16::ZERO) } + f16x16::ZERO.as_m256h() } /// Return vector of type `__m512h` with indetermination elements. @@ -298,7 +298,7 @@ pub fn _mm256_undefined_ph() -> __m256h { #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_undefined_ph() -> __m512h { - unsafe { transmute(f16x32::ZERO) } + f16x32::ZERO.as_m512h() } /// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and @@ -1552,7 +1552,7 @@ pub fn _mm512_maskz_add_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_add_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_add_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_add_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -1603,7 +1603,7 @@ pub fn _mm_mask_add_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_add_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_add_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_add_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -1864,7 +1864,7 @@ pub fn _mm512_maskz_sub_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_sub_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_sub_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_sub_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the @@ -1915,7 +1915,7 @@ pub fn _mm_mask_sub_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_sub_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_sub_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_sub_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the @@ -2176,7 +2176,7 @@ pub fn _mm512_maskz_mul_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mul_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_mul_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_mul_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// 
Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -2227,7 +2227,7 @@ pub fn _mm_mask_mul_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_mul_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_mul_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_mul_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -2488,7 +2488,7 @@ pub fn _mm512_maskz_div_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_div_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_div_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_div_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the @@ -2539,7 +2539,7 @@ pub fn _mm_mask_div_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_div_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_div_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_div_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the @@ -2794,7 +2794,7 @@ pub fn _mm512_maskz_mul_round_pch( #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_sch(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using @@ -2822,7 +2822,7 @@ pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __ #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_sch(_mm_setzero_ph(), k, a, b) + _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst, @@ -2846,7 +2846,7 @@ pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mul_round_sch(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_mul_round_sch::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_mul_round_sch::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using @@ -2911,7 +2911,7 @@ pub fn _mm_maskz_mul_round_sch( b: __m128h, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_mul_round_sch::(_mm_setzero_ph(), k, a, b) + _mm_mask_mul_round_sch::(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply packed complex numbers in a and b, and store the results in dst. 
Each complex number is @@ -3445,7 +3445,7 @@ pub fn _mm512_maskz_cmul_round_pch( #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_sch(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, @@ -3473,7 +3473,7 @@ pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_sch(_mm_setzero_ph(), k, a, b) + _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, @@ -3496,7 +3496,7 @@ pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cmul_round_sch(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cmul_round_sch::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cmul_round_sch::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, @@ -3561,7 +3561,7 @@ pub fn _mm_maskz_cmul_round_sch( b: __m128h, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cmul_round_sch::(_mm_setzero_ph(), k, a, b) + _mm_mask_cmul_round_sch::(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and @@ -7782,7 +7782,7 @@ pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h { #[cfg_attr(test, assert_instr(vrcpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rcp_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, @@ -7810,7 +7810,7 @@ pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vrcpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rcp_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point @@ -7947,7 +7947,7 @@ pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h { #[cfg_attr(test, assert_instr(vrsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rsqrt_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point @@ -7975,7 +7975,7 @@ pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vrsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rsqrt_sh(_mm_setzero_ph(), k, a, b) + 
_mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -8169,7 +8169,7 @@ pub fn _mm512_maskz_sqrt_round_ph(k: __mmask32, a: __m512h) #[cfg_attr(test, assert_instr(vsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_sqrt_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store @@ -8195,7 +8195,7 @@ pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __ #[cfg_attr(test, assert_instr(vsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_sqrt_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store @@ -8217,7 +8217,7 @@ pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_sqrt_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_sqrt_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_sqrt_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store @@ -8272,7 +8272,7 @@ pub fn _mm_maskz_sqrt_round_sh( b: __m128h, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_sqrt_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_sqrt_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8496,7 +8496,7 @@ pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vmaxsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_max_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value @@ -8553,7 +8553,7 @@ pub fn _mm_mask_max_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_max_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); - _mm_mask_max_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_max_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8776,7 +8776,7 @@ pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vminsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_min_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value @@ -8833,7 +8833,7 @@ pub fn _mm_mask_min_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_min_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); 
- _mm_mask_min_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_min_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision @@ -9024,7 +9024,7 @@ pub fn _mm512_maskz_getexp_round_ph(k: __mmask32, a: __m512h) -> #[cfg_attr(test, assert_instr(vgetexpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_getexp_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision @@ -9054,7 +9054,7 @@ pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> #[cfg_attr(test, assert_instr(vgetexpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_getexp_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision @@ -9071,7 +9071,7 @@ pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_getexp_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); - _mm_mask_getexp_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_getexp_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision @@ -9112,7 +9112,7 @@ pub fn _mm_mask_getexp_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_getexp_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); - _mm_mask_getexp_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_getexp_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store @@ -9585,7 +9585,7 @@ pub fn _mm_getmant_sh __m128h { static_assert_uimm_bits!(NORM, 4); static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store @@ -9662,7 +9662,7 @@ pub fn _mm_maskz_getmant_sh< ) -> __m128h { static_assert_uimm_bits!(NORM, 4); static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store @@ -9703,7 +9703,7 @@ pub fn _mm_getmant_round_sh< static_assert_uimm_bits!(NORM, 4); static_assert_uimm_bits!(SIGN, 2); static_assert_sae!(SAE); - _mm_mask_getmant_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store @@ -9790,7 +9790,7 @@ pub fn _mm_maskz_getmant_round_sh< static_assert_uimm_bits!(NORM, 4); static_assert_uimm_bits!(SIGN, 2); static_assert_sae!(SAE); - _mm_mask_getmant_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Round packed half-precision (16-bit) floating-point elements in a 
to the number of fraction bits @@ -10111,7 +10111,7 @@ pub fn _mm512_maskz_roundscale_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_roundscale_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits @@ -10162,7 +10162,7 @@ pub fn _mm_mask_roundscale_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_roundscale_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits @@ -10188,7 +10188,7 @@ pub fn _mm_maskz_roundscale_sh(k: __mmask8, a: __m128h, b: __m1 pub fn _mm_roundscale_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); - _mm_mask_roundscale_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits @@ -10251,7 +10251,7 @@ pub fn _mm_maskz_roundscale_round_sh( ) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); - _mm_mask_roundscale_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store @@ -10449,7 +10449,7 @@ pub fn _mm512_maskz_scalef_round_ph( #[cfg_attr(test, assert_instr(vscalefsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store @@ -10475,7 +10475,7 @@ pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> #[cfg_attr(test, assert_instr(vscalefsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store @@ -10498,7 +10498,7 @@ pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_scalef_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_scalef_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store @@ -10555,7 +10555,7 @@ pub fn _mm_maskz_scalef_round_sh( b: __m128h, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_scalef_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the @@ 
-10872,7 +10872,7 @@ pub fn _mm512_maskz_reduce_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_reduce_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_reduce_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by @@ -10925,7 +10925,7 @@ pub fn _mm_mask_reduce_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_reduce_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by @@ -10951,7 +10951,7 @@ pub fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: __m128h) pub fn _mm_reduce_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); - _mm_mask_reduce_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_reduce_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by @@ -11016,7 +11016,7 @@ pub fn _mm_maskz_reduce_round_sh( ) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); - _mm_mask_reduce_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_reduce_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the @@ -12060,7 +12060,7 @@ pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256 #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h { - _mm512_mask_cvtepi32_ph(_mm256_setzero_ph(), k, a) + _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a) } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12135,7 +12135,7 @@ pub fn _mm512_mask_cvt_roundepi32_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundepi32_ph(k: __mmask16, a: __m512i) -> __m256h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepi32_ph::(_mm256_setzero_ph(), k, a) + _mm512_mask_cvt_roundepi32_ph::(f16x16::ZERO.as_m256h(), k, a) } /// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the @@ -12285,7 +12285,7 @@ pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256 #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h { - _mm512_mask_cvtepu32_ph(_mm256_setzero_ph(), k, a) + _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a) } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12360,7 +12360,7 @@ pub fn _mm512_mask_cvt_roundepu32_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundepu32_ph(k: __mmask16, a: __m512i) -> __m256h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepu32_ph::(_mm256_setzero_ph(), k, a) + _mm512_mask_cvt_roundepu32_ph::(f16x16::ZERO.as_m256h(), k, a) } /// Convert the unsigned 32-bit integer 
b to a half-precision (16-bit) floating-point element, store the @@ -12511,7 +12511,7 @@ pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h { - _mm512_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) + _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12586,7 +12586,7 @@ pub fn _mm512_mask_cvt_roundepi64_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundepi64_ph(k: __mmask8, a: __m512i) -> __m128h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepi64_ph::(_mm_setzero_ph(), k, a) + _mm512_mask_cvt_roundepi64_ph::(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12699,7 +12699,7 @@ pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h { - _mm512_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) + _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12774,7 +12774,7 @@ pub fn _mm512_mask_cvt_roundepu64_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundepu64_ph(k: __mmask8, a: __m512i) -> __m128h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepu64_ph::(_mm_setzero_ph(), k, a) + _mm512_mask_cvt_roundepu64_ph::(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12862,7 +12862,7 @@ pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h { #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h { - _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), 0xffff, a) + _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12888,7 +12888,7 @@ pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h { #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h { - _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), k, a) + _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12910,7 +12910,7 @@ pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_cvtx_roundps_ph(a: __m512) -> __m256h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvtx_roundps_ph::(_mm256_setzero_ph(), 0xffff, a) + _mm512_mask_cvtx_roundps_ph::(f16x16::ZERO.as_m256h(), 0xffff, a) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12962,7 +12962,7 @@ pub fn _mm512_mask_cvtx_roundps_ph( #[unstable(feature = 
"stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtx_roundps_ph(k: __mmask16, a: __m512) -> __m256h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvtx_roundps_ph::(_mm256_setzero_ph(), k, a) + _mm512_mask_cvtx_roundps_ph::(f16x16::ZERO.as_m256h(), k, a) } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -12975,7 +12975,7 @@ pub fn _mm512_maskz_cvtx_roundps_ph(k: __mmask16, a: __m512 #[cfg_attr(test, assert_instr(vcvtss2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h { - _mm_mask_cvtss_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -13003,7 +13003,7 @@ pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __ #[cfg_attr(test, assert_instr(vcvtss2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h { - _mm_mask_cvtss_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -13026,7 +13026,7 @@ pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cvt_roundss_sh(a: __m128h, b: __m128) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundss_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cvt_roundss_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -13085,7 +13085,7 @@ pub fn _mm_maskz_cvt_roundss_sh( b: __m128, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundss_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_cvt_roundss_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -13173,7 +13173,7 @@ pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h { #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h { - _mm512_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) + _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a) } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -13199,7 +13199,7 @@ pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h { #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h { - _mm512_mask_cvtpd_ph(_mm_setzero_ph(), k, a) + _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -13221,7 +13221,7 @@ pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_cvt_roundpd_ph(a: __m512d) -> __m128h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_ph::(_mm_setzero_ph(), 0xff, a) + _mm512_mask_cvt_roundpd_ph::(f16x8::ZERO.as_m128h(), 0xff, a) } /// Convert packed double-precision (64-bit) 
floating-point elements in a to packed half-precision (16-bit) @@ -13273,7 +13273,7 @@ pub fn _mm512_mask_cvt_roundpd_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundpd_ph(k: __mmask8, a: __m512d) -> __m128h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_ph::(_mm_setzero_ph(), k, a) + _mm512_mask_cvt_roundpd_ph::(f16x8::ZERO.as_m128h(), k, a) } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13286,7 +13286,7 @@ pub fn _mm512_maskz_cvt_roundpd_ph(k: __mmask8, a: __m512d) #[cfg_attr(test, assert_instr(vcvtsd2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h { - _mm_mask_cvtsd_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13314,7 +13314,7 @@ pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> _ #[cfg_attr(test, assert_instr(vcvtsd2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h { - _mm_mask_cvtsd_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13337,7 +13337,7 @@ pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cvt_roundsd_sh(a: __m128h, b: __m128d) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundsd_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cvt_roundsd_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13396,7 +13396,7 @@ pub fn _mm_maskz_cvt_roundsd_sh( b: __m128d, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundsd_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_cvt_roundsd_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and @@ -15899,7 +15899,7 @@ pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vcvtsh2ss))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { - _mm_mask_cvtsh_ss(_mm_setzero_ps(), k, a, b) + _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b) } /// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) @@ -15959,7 +15959,7 @@ pub fn _mm_mask_cvt_roundsh_ss( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvt_roundsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { static_assert_sae!(SAE); - _mm_mask_cvt_roundsh_ss::(_mm_setzero_ps(), k, a, b) + _mm_mask_cvt_roundsh_ss::(_mm_set_ss(0.0), k, a, b) } /// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) @@ -16169,7 +16169,7 @@ pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vcvtsh2sd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d { - 
_mm_mask_cvtsh_sd(_mm_setzero_pd(), k, a, b) + _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b) } /// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit) @@ -16228,7 +16228,7 @@ pub fn _mm_mask_cvt_roundsh_sd( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvt_roundsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d { static_assert_sae!(SAE); - _mm_mask_cvt_roundsh_sd::(_mm_setzero_pd(), k, a, b) + _mm_mask_cvt_roundsh_sd::(_mm_set_sd(0.0), k, a, b) } /// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`. diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 68cf9b5960..d59b120fa5 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -517,17 +517,26 @@ mod test; pub use self::test::*; macro_rules! as_transmute { - ($from:ty => $($name:ident -> $to:ident),* $(,)?) => { + ($from:ty => $as_from:ident, $($as_to:ident -> $to:ident),* $(,)?) => { impl $from {$( #[inline] - pub(crate) fn $name(self) -> crate::core_arch::simd::$to { + pub(crate) fn $as_to(self) -> crate::core_arch::simd::$to { unsafe { transmute(self) } } )*} + $( + impl crate::core_arch::simd::$to { + #[inline] + pub(crate) fn $as_from(self) -> $from { + unsafe { transmute(self) } + } + } + )* }; } as_transmute!(__m128i => + as_m128i, as_u8x16 -> u8x16, as_u16x8 -> u16x8, as_u32x4 -> u32x4, @@ -538,6 +547,7 @@ as_transmute!(__m128i => as_i64x2 -> i64x2, ); as_transmute!(__m256i => + as_m256i, as_u8x32 -> u8x32, as_u16x16 -> u16x16, as_u32x8 -> u32x8, @@ -548,6 +558,7 @@ as_transmute!(__m256i => as_i64x4 -> i64x4, ); as_transmute!(__m512i => + as_m512i, as_u8x64 -> u8x64, as_u16x32 -> u16x32, as_u32x16 -> u32x16, @@ -558,35 +569,38 @@ as_transmute!(__m512i => as_i64x8 -> i64x8, ); -as_transmute!(__m128 => as_f32x4 -> f32x4); -as_transmute!(__m128d => as_f64x2 -> f64x2); -as_transmute!(__m256 => as_f32x8 -> f32x8); -as_transmute!(__m256d => as_f64x4 -> f64x4); -as_transmute!(__m512 => as_f32x16 -> f32x16); -as_transmute!(__m512d => as_f64x8 -> f64x8); +as_transmute!(__m128 => as_m128, as_f32x4 -> f32x4); +as_transmute!(__m128d => as_m128d, as_f64x2 -> f64x2); +as_transmute!(__m256 => as_m256, as_f32x8 -> f32x8); +as_transmute!(__m256d => as_m256d, as_f64x4 -> f64x4); +as_transmute!(__m512 => as_m512, as_f32x16 -> f32x16); +as_transmute!(__m512d => as_m512d, as_f64x8 -> f64x8); as_transmute!(__m128bh => + as_m128bh, as_u16x8 -> u16x8, as_u32x4 -> u32x4, as_i16x8 -> i16x8, as_i32x4 -> i32x4, ); as_transmute!(__m256bh => + as_m256bh, as_u16x16 -> u16x16, as_u32x8 -> u32x8, as_i16x16 -> i16x16, as_i32x8 -> i32x8, ); as_transmute!(__m512bh => + as_m512bh, as_u16x32 -> u16x32, as_u32x16 -> u32x16, as_i16x32 -> i16x32, as_i32x16 -> i32x16, ); -as_transmute!(__m128h => as_f16x8 -> f16x8); -as_transmute!(__m256h => as_f16x16 -> f16x16); -as_transmute!(__m512h => as_f16x32 -> f16x32); +as_transmute!(__m128h => as_m128h, as_f16x8 -> f16x8); +as_transmute!(__m256h => as_m256h, as_f16x16 -> f16x16); +as_transmute!(__m512h => as_m512h, as_f16x32 -> f16x32); mod eflags; #[stable(feature = "simd_x86", since = "1.27.0")] From 9713d4937c4c15806e4604e37d83de231db20052 Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 28 Apr 2025 11:32:14 +0530 Subject: [PATCH 3/3] Add `avx512vl` requirement to testsuite for avx512fp16 128 and 256 bit --- crates/core_arch/src/x86/avx512fp16.rs | 40 +++++++++++------------ crates/core_arch/src/x86/test.rs | 4 +-- 
---
 crates/core_arch/src/x86/avx512fp16.rs    | 40 +++++++++++------------
 crates/core_arch/src/x86/test.rs          |  4 +--
 crates/core_arch/src/x86_64/avx512fp16.rs |  8 ++---
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs
index b6d3d75ed0..b674875893 100644
--- a/crates/core_arch/src/x86/avx512fp16.rs
+++ b/crates/core_arch/src/x86/avx512fp16.rs
@@ -16670,14 +16670,14 @@ mod tests {
         )
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_set_ph() {
         let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_set_ph() {
         let r = _mm256_set_ph(
             1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -16703,21 +16703,21 @@ mod tests {
         assert_eq_m512h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_set_sh() {
         let r = _mm_set_sh(1.0);
         let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_set1_ph() {
         let r = _mm_set1_ph(1.0);
         let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_set1_ph() {
         let r = _mm256_set1_ph(1.0);
         let e = _mm256_set_ph(
@@ -16736,14 +16736,14 @@ mod tests {
         assert_eq_m512h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_setr_ph() {
         let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_setr_ph() {
         let r = _mm256_setr_ph(
             1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -16790,7 +16790,7 @@ mod tests {
         assert_eq_m512h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_castsi128_ph() {
         let a = _mm_set1_epi16(0x3c00);
         let r = _mm_castsi128_ph(a);
         let e = _mm_set1_ph(1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castsi256_ph() {
         let a = _mm256_set1_epi16(0x3c00);
         let r = _mm256_castsi256_ph(a);
@@ -16838,7 +16838,7 @@ mod tests {
         assert_eq_m512i(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_castps_ph() {
         let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
         let r = _mm_castps_ph(a);
         let e = _mm_set1_ph(1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castps_ph() {
         let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
         let r = _mm256_castps_ph(a);
@@ -16886,7 +16886,7 @@ mod tests {
         assert_eq_m512(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_castpd_ph() {
         let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
         let r = _mm_castpd_ph(a);
         let e = _mm_set1_ph(1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castpd_ph() {
         let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
         let r = _mm256_castpd_ph(a);
@@ -16934,7 +16934,7 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castph256_ph128() {
         let a = _mm256_setr_ph(
             1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
@@ -16944,7 +16944,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm512_castph512_ph128() {
         let a = _mm512_setr_ph(
             1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
@@ -16955,7 +16955,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm512_castph512_ph256() {
         let a = _mm512_setr_ph(
             1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
@@ -16968,21 +16968,21 @@ mod tests {
         assert_eq_m256h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castph128_ph256() {
         let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
         let r = _mm256_castph128_ph256(a);
         assert_eq_m128h(_mm256_castph256_ph128(r), a);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm512_castph128_ph512() {
         let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
         let r = _mm512_castph128_ph512(a);
         assert_eq_m128h(_mm512_castph512_ph128(r), a);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm512_castph256_ph512() {
         let a = _mm256_setr_ph(
             1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
@@ -16991,7 +16991,7 @@ mod tests {
         assert_eq_m256h(_mm512_castph512_ph256(r), a);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_zextph128_ph256() {
         let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
         let r = _mm256_zextph128_ph256(a);
diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs
index 050a51f9fa..fec25ce2bc 100644
--- a/crates/core_arch/src/x86/test.rs
+++ b/crates/core_arch/src/x86/test.rs
@@ -37,7 +37,7 @@ pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
 }
 
 #[track_caller]
-#[target_feature(enable = "avx512fp16")]
+#[target_feature(enable = "avx512fp16,avx512vl")]
 pub unsafe fn assert_eq_m128h(a: __m128h, b: __m128h) {
     let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
     if r != 0b1111_1111 {
@@ -87,7 +87,7 @@ pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 {
 }
 
 #[track_caller]
-#[target_feature(enable = "avx512fp16")]
+#[target_feature(enable = "avx512fp16,avx512vl")]
 pub unsafe fn assert_eq_m256h(a: __m256h, b: __m256h) {
     let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
     if r != 0b11111111_11111111 {
diff --git a/crates/core_arch/src/x86_64/avx512fp16.rs b/crates/core_arch/src/x86_64/avx512fp16.rs
index 69f1dcb5c7..955c6ccc75 100644
--- a/crates/core_arch/src/x86_64/avx512fp16.rs
+++ b/crates/core_arch/src/x86_64/avx512fp16.rs
@@ -231,7 +231,7 @@ mod tests {
     use crate::core_arch::{x86::*, x86_64::*};
     use stdarch_test::simd_test;
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_cvti64_sh() {
         let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let r = _mm_cvti64_sh(a, 10);
@@ -239,7 +239,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_cvt_roundi64_sh() {
         let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let r = _mm_cvt_roundi64_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
@@ -247,7 +247,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_cvtu64_sh() {
         let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let r = _mm_cvtu64_sh(a, 10);
@@ -255,7 +255,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_cvt_roundu64_sh() {
         let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let r = _mm_cvt_roundu64_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);