From aa5a55e4966b83ba80896623515bdefec987328f Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 28 Apr 2025 12:02:27 +0530 Subject: [PATCH 1/3] Fix CI errors due to alignment issues in msvc --- crates/core_arch/src/x86/avx.rs | 30 ++++++++-- crates/core_arch/src/x86/avx512f.rs | 90 +++++++++++++++++++++++------ crates/core_arch/src/x86/sse.rs | 32 ++++++++-- crates/core_arch/src/x86/sse2.rs | 25 ++++++-- 4 files changed, 142 insertions(+), 35 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index f97bab4994..9a8c08b01b 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1425,7 +1425,10 @@ pub fn _mm256_insert_epi32(a: __m256i, i: i32) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovap))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovap) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { @@ -1440,7 +1443,10 @@ pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovap))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovap) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) { @@ -1455,7 +1461,10 @@ pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { @@ -1470,7 +1479,10 @@ pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) { @@ -1548,7 +1560,10 @@ pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] // FIXME vmovdqa expected #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { *mem_addr @@ -1561,7 +1576,10 @@ pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { /// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] // FIXME vmovdqa expected #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { *mem_addr = a; diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a81b64c383..74a2c5ed68 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -34468,7 +34468,10 @@ pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm512_load_si512(mem_addr: *const __m512i) -> __m512i { ptr::read(mem_addr) } @@ -34479,7 +34482,10 @@ pub unsafe fn _mm512_load_si512(mem_addr: *const __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm512_store_si512(mem_addr: *mut __m512i, a: __m512i) { ptr::write(mem_addr, a); } @@ -34490,7 +34496,10 @@ pub unsafe fn _mm512_store_si512(mem_addr: *mut __m512i, a: __m512i) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i { ptr::read(mem_addr as *const __m512i) } @@ -34501,7 +34510,10 @@ pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i { ptr::read(mem_addr as *const __m256i) } @@ -34512,7 +34524,10 @@ pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i { ptr::read(mem_addr as *const __m128i) } @@ -34523,7 +34538,10 @@ pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + 
assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) { ptr::write(mem_addr as *mut __m512i, a); } @@ -34534,7 +34552,10 @@ pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) { ptr::write(mem_addr as *mut __m256i, a); } @@ -34545,7 +34566,10 @@ pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) { ptr::write(mem_addr as *mut __m128i, a); } @@ -34556,7 +34580,10 @@ pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i { ptr::read(mem_addr as *const __m512i) } @@ -34567,7 +34594,10 @@ pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i { ptr::read(mem_addr as *const __m256i) } @@ -34578,7 +34608,10 @@ pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i { ptr::read(mem_addr as *const __m128i) } @@ -34589,7 +34622,10 @@ pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) { ptr::write(mem_addr as *mut __m512i, a); } @@ -34600,7 +34636,10 @@ pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + 
assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) { ptr::write(mem_addr as *mut __m256i, a); } @@ -34611,7 +34650,10 @@ pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) { #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64 +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) { ptr::write(mem_addr as *mut __m128i, a); } @@ -34622,7 +34664,10 @@ pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 { ptr::read(mem_addr as *const __m512) } @@ -34633,7 +34678,10 @@ pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) { ptr::write(mem_addr as *mut __m512, a); } @@ -34644,7 +34692,10 @@ pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovapd pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d { ptr::read(mem_addr as *const __m512d) } @@ -34655,7 +34706,10 @@ pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovapd pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { ptr::write(mem_addr as *mut __m512d, a); } diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index c31d6541a9..a845e822fc 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1158,7 +1158,12 @@ pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +// FIXME: Rust doesn't emit alignment attributes for MSVC x86-32. 
Ref https://github.com/rust-lang/rust/pull/139261 +// All aligned load/store intrinsics are affected +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { @@ -1213,7 +1218,10 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { let a = _mm_load_ps(p); @@ -1253,7 +1261,10 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { @@ -1266,7 +1277,10 @@ pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { _mm_store1_ps(p, a); @@ -1285,7 +1299,10 @@ pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { @@ -1329,7 +1346,10 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index aeba43e1da..3dabcde18c 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -1243,7 +1243,10 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + 
assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { *mem_addr @@ -1293,7 +1296,10 @@ pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { *mem_addr = a; @@ -2535,7 +2541,10 @@ pub fn _mm_movemask_pd(a: __m128d) -> i32 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { @@ -2628,7 +2637,10 @@ pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { @@ -2783,7 +2795,10 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd) #[inline] #[target_feature(enable = "sse2")] -#[cfg_attr(test, assert_instr(movaps))] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { let a = _mm_load_pd(mem_addr); From 88680a369da551868059294aff62f0503b9a8ccd Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 28 Apr 2025 11:31:22 +0530 Subject: [PATCH 2/3] Fix errors in decoupling avx512vl and avx512dq from avx512fp16 --- crates/core_arch/src/x86/avx512fp16.rs | 162 ++++++++++++------------- crates/core_arch/src/x86/mod.rs | 36 ++++-- 2 files changed, 106 insertions(+), 92 deletions(-) diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs index d0a4f41dac..b6d3d75ed0 100644 --- a/crates/core_arch/src/x86/avx512fp16.rs +++ b/crates/core_arch/src/x86/avx512fp16.rs @@ -249,7 +249,7 @@ pub fn _mm_setzero_ph() -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm256_setzero_ph() -> __m256h { - unsafe { transmute(f16x16::ZERO) } + f16x16::ZERO.as_m256h() } /// Return vector of type __m512h with all elements set to zero. 
@@ -259,7 +259,7 @@ pub fn _mm256_setzero_ph() -> __m256h { #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_setzero_ph() -> __m512h { - unsafe { transmute(f16x32::ZERO) } + f16x32::ZERO.as_m512h() } /// Return vector of type `__m128h` with indetermination elements. @@ -272,7 +272,7 @@ pub fn _mm512_setzero_ph() -> __m512h { #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_undefined_ph() -> __m128h { - unsafe { transmute(f16x8::ZERO) } + f16x8::ZERO.as_m128h() } /// Return vector of type `__m256h` with indetermination elements. @@ -285,7 +285,7 @@ pub fn _mm_undefined_ph() -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm256_undefined_ph() -> __m256h { - unsafe { transmute(f16x16::ZERO) } + f16x16::ZERO.as_m256h() } /// Return vector of type `__m512h` with indetermination elements. @@ -298,7 +298,7 @@ pub fn _mm256_undefined_ph() -> __m256h { #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_undefined_ph() -> __m512h { - unsafe { transmute(f16x32::ZERO) } + f16x32::ZERO.as_m512h() } /// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and @@ -1552,7 +1552,7 @@ pub fn _mm512_maskz_add_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_add_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_add_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_add_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -1603,7 +1603,7 @@ pub fn _mm_mask_add_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_add_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_add_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_add_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -1864,7 +1864,7 @@ pub fn _mm512_maskz_sub_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_sub_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_sub_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_sub_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the @@ -1915,7 +1915,7 @@ pub fn _mm_mask_sub_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_sub_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_sub_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_sub_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the @@ -2176,7 +2176,7 @@ pub fn _mm512_maskz_mul_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mul_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_mul_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_mul_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// 
Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -2227,7 +2227,7 @@ pub fn _mm_mask_mul_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_mul_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_mul_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_mul_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -2488,7 +2488,7 @@ pub fn _mm512_maskz_div_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_div_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_div_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_div_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the @@ -2539,7 +2539,7 @@ pub fn _mm_mask_div_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_div_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_div_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_div_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the @@ -2794,7 +2794,7 @@ pub fn _mm512_maskz_mul_round_pch( #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_sch(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using @@ -2822,7 +2822,7 @@ pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __ #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_sch(_mm_setzero_ph(), k, a, b) + _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst, @@ -2846,7 +2846,7 @@ pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_mul_round_sch(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_mul_round_sch::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_mul_round_sch::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using @@ -2911,7 +2911,7 @@ pub fn _mm_maskz_mul_round_sch( b: __m128h, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_mul_round_sch::(_mm_setzero_ph(), k, a, b) + _mm_mask_mul_round_sch::(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply packed complex numbers in a and b, and store the results in dst. 
Each complex number is @@ -3445,7 +3445,7 @@ pub fn _mm512_maskz_cmul_round_pch( #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_sch(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, @@ -3473,7 +3473,7 @@ pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_sch(_mm_setzero_ph(), k, a, b) + _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, @@ -3496,7 +3496,7 @@ pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cmul_round_sch(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cmul_round_sch::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cmul_round_sch::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, @@ -3561,7 +3561,7 @@ pub fn _mm_maskz_cmul_round_sch( b: __m128h, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cmul_round_sch::(_mm_setzero_ph(), k, a, b) + _mm_mask_cmul_round_sch::(f16x8::ZERO.as_m128h(), k, a, b) } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and @@ -7782,7 +7782,7 @@ pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h { #[cfg_attr(test, assert_instr(vrcpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rcp_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, @@ -7810,7 +7810,7 @@ pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vrcpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rcp_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point @@ -7947,7 +7947,7 @@ pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h { #[cfg_attr(test, assert_instr(vrsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rsqrt_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point @@ -7975,7 +7975,7 @@ pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vrsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rsqrt_sh(_mm_setzero_ph(), k, a, b) + 
_mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -8169,7 +8169,7 @@ pub fn _mm512_maskz_sqrt_round_ph(k: __mmask32, a: __m512h) #[cfg_attr(test, assert_instr(vsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_sqrt_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store @@ -8195,7 +8195,7 @@ pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __ #[cfg_attr(test, assert_instr(vsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_sqrt_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store @@ -8217,7 +8217,7 @@ pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_sqrt_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_sqrt_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_sqrt_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store @@ -8272,7 +8272,7 @@ pub fn _mm_maskz_sqrt_round_sh( b: __m128h, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_sqrt_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_sqrt_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8496,7 +8496,7 @@ pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vmaxsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_max_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value @@ -8553,7 +8553,7 @@ pub fn _mm_mask_max_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_max_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); - _mm_mask_max_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_max_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8776,7 +8776,7 @@ pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vminsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_min_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value @@ -8833,7 +8833,7 @@ pub fn _mm_mask_min_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_min_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); 
- _mm_mask_min_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_min_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision @@ -9024,7 +9024,7 @@ pub fn _mm512_maskz_getexp_round_ph(k: __mmask32, a: __m512h) -> #[cfg_attr(test, assert_instr(vgetexpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_getexp_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision @@ -9054,7 +9054,7 @@ pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> #[cfg_attr(test, assert_instr(vgetexpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_getexp_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision @@ -9071,7 +9071,7 @@ pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_getexp_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); - _mm_mask_getexp_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_getexp_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision @@ -9112,7 +9112,7 @@ pub fn _mm_mask_getexp_round_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_getexp_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); - _mm_mask_getexp_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_getexp_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store @@ -9585,7 +9585,7 @@ pub fn _mm_getmant_sh __m128h { static_assert_uimm_bits!(NORM, 4); static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store @@ -9662,7 +9662,7 @@ pub fn _mm_maskz_getmant_sh< ) -> __m128h { static_assert_uimm_bits!(NORM, 4); static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store @@ -9703,7 +9703,7 @@ pub fn _mm_getmant_round_sh< static_assert_uimm_bits!(NORM, 4); static_assert_uimm_bits!(SIGN, 2); static_assert_sae!(SAE); - _mm_mask_getmant_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store @@ -9790,7 +9790,7 @@ pub fn _mm_maskz_getmant_round_sh< static_assert_uimm_bits!(NORM, 4); static_assert_uimm_bits!(SIGN, 2); static_assert_sae!(SAE); - _mm_mask_getmant_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Round packed half-precision (16-bit) floating-point elements in a 
to the number of fraction bits @@ -10111,7 +10111,7 @@ pub fn _mm512_maskz_roundscale_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_roundscale_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits @@ -10162,7 +10162,7 @@ pub fn _mm_mask_roundscale_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_roundscale_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits @@ -10188,7 +10188,7 @@ pub fn _mm_maskz_roundscale_sh(k: __mmask8, a: __m128h, b: __m1 pub fn _mm_roundscale_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); - _mm_mask_roundscale_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits @@ -10251,7 +10251,7 @@ pub fn _mm_maskz_roundscale_round_sh( ) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); - _mm_mask_roundscale_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store @@ -10449,7 +10449,7 @@ pub fn _mm512_maskz_scalef_round_ph( #[cfg_attr(test, assert_instr(vscalefsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store @@ -10475,7 +10475,7 @@ pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> #[cfg_attr(test, assert_instr(vscalefsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store @@ -10498,7 +10498,7 @@ pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_scalef_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_scalef_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store @@ -10555,7 +10555,7 @@ pub fn _mm_maskz_scalef_round_sh( b: __m128h, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_scalef_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the @@ 
-10872,7 +10872,7 @@ pub fn _mm512_maskz_reduce_round_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_reduce_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_reduce_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by @@ -10925,7 +10925,7 @@ pub fn _mm_mask_reduce_sh( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_reduce_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by @@ -10951,7 +10951,7 @@ pub fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: __m128h) pub fn _mm_reduce_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); - _mm_mask_reduce_round_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_reduce_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by @@ -11016,7 +11016,7 @@ pub fn _mm_maskz_reduce_round_sh( ) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); - _mm_mask_reduce_round_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_reduce_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the @@ -12060,7 +12060,7 @@ pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256 #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h { - _mm512_mask_cvtepi32_ph(_mm256_setzero_ph(), k, a) + _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a) } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12135,7 +12135,7 @@ pub fn _mm512_mask_cvt_roundepi32_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundepi32_ph(k: __mmask16, a: __m512i) -> __m256h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepi32_ph::(_mm256_setzero_ph(), k, a) + _mm512_mask_cvt_roundepi32_ph::(f16x16::ZERO.as_m256h(), k, a) } /// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the @@ -12285,7 +12285,7 @@ pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256 #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h { - _mm512_mask_cvtepu32_ph(_mm256_setzero_ph(), k, a) + _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a) } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12360,7 +12360,7 @@ pub fn _mm512_mask_cvt_roundepu32_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundepu32_ph(k: __mmask16, a: __m512i) -> __m256h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepu32_ph::(_mm256_setzero_ph(), k, a) + _mm512_mask_cvt_roundepu32_ph::(f16x16::ZERO.as_m256h(), k, a) } /// Convert the unsigned 32-bit integer 
b to a half-precision (16-bit) floating-point element, store the @@ -12511,7 +12511,7 @@ pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h { - _mm512_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) + _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12586,7 +12586,7 @@ pub fn _mm512_mask_cvt_roundepi64_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundepi64_ph(k: __mmask8, a: __m512i) -> __m128h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepi64_ph::(_mm_setzero_ph(), k, a) + _mm512_mask_cvt_roundepi64_ph::(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12699,7 +12699,7 @@ pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h { - _mm512_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) + _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12774,7 +12774,7 @@ pub fn _mm512_mask_cvt_roundepu64_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundepu64_ph(k: __mmask8, a: __m512i) -> __m128h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepu64_ph::(_mm_setzero_ph(), k, a) + _mm512_mask_cvt_roundepu64_ph::(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12862,7 +12862,7 @@ pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h { #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h { - _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), 0xffff, a) + _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12888,7 +12888,7 @@ pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h { #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h { - _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), k, a) + _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12910,7 +12910,7 @@ pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_cvtx_roundps_ph(a: __m512) -> __m256h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvtx_roundps_ph::(_mm256_setzero_ph(), 0xffff, a) + _mm512_mask_cvtx_roundps_ph::(f16x16::ZERO.as_m256h(), 0xffff, a) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12962,7 +12962,7 @@ pub fn _mm512_mask_cvtx_roundps_ph( #[unstable(feature = 
"stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtx_roundps_ph(k: __mmask16, a: __m512) -> __m256h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvtx_roundps_ph::(_mm256_setzero_ph(), k, a) + _mm512_mask_cvtx_roundps_ph::(f16x16::ZERO.as_m256h(), k, a) } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -12975,7 +12975,7 @@ pub fn _mm512_maskz_cvtx_roundps_ph(k: __mmask16, a: __m512 #[cfg_attr(test, assert_instr(vcvtss2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h { - _mm_mask_cvtss_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -13003,7 +13003,7 @@ pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __ #[cfg_attr(test, assert_instr(vcvtss2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h { - _mm_mask_cvtss_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -13026,7 +13026,7 @@ pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cvt_roundss_sh(a: __m128h, b: __m128) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundss_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cvt_roundss_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -13085,7 +13085,7 @@ pub fn _mm_maskz_cvt_roundss_sh( b: __m128, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundss_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_cvt_roundss_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -13173,7 +13173,7 @@ pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h { #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h { - _mm512_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) + _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a) } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -13199,7 +13199,7 @@ pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h { #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h { - _mm512_mask_cvtpd_ph(_mm_setzero_ph(), k, a) + _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a) } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -13221,7 +13221,7 @@ pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_cvt_roundpd_ph(a: __m512d) -> __m128h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_ph::(_mm_setzero_ph(), 0xff, a) + _mm512_mask_cvt_roundpd_ph::(f16x8::ZERO.as_m128h(), 0xff, a) } /// Convert packed double-precision (64-bit) 
floating-point elements in a to packed half-precision (16-bit) @@ -13273,7 +13273,7 @@ pub fn _mm512_mask_cvt_roundpd_ph( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm512_maskz_cvt_roundpd_ph(k: __mmask8, a: __m512d) -> __m128h { static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_ph::(_mm_setzero_ph(), k, a) + _mm512_mask_cvt_roundpd_ph::(f16x8::ZERO.as_m128h(), k, a) } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13286,7 +13286,7 @@ pub fn _mm512_maskz_cvt_roundpd_ph(k: __mmask8, a: __m512d) #[cfg_attr(test, assert_instr(vcvtsd2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h { - _mm_mask_cvtsd_sh(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13314,7 +13314,7 @@ pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> _ #[cfg_attr(test, assert_instr(vcvtsd2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h { - _mm_mask_cvtsd_sh(_mm_setzero_ph(), k, a, b) + _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13337,7 +13337,7 @@ pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h { #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_cvt_roundsd_sh(a: __m128h, b: __m128d) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundsd_sh::(_mm_undefined_ph(), 0xff, a, b) + _mm_mask_cvt_roundsd_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13396,7 +13396,7 @@ pub fn _mm_maskz_cvt_roundsd_sh( b: __m128d, ) -> __m128h { static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundsd_sh::(_mm_setzero_ph(), k, a, b) + _mm_mask_cvt_roundsd_sh::(f16x8::ZERO.as_m128h(), k, a, b) } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and @@ -15899,7 +15899,7 @@ pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vcvtsh2ss))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { - _mm_mask_cvtsh_ss(_mm_setzero_ps(), k, a, b) + _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b) } /// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) @@ -15959,7 +15959,7 @@ pub fn _mm_mask_cvt_roundsh_ss( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvt_roundsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { static_assert_sae!(SAE); - _mm_mask_cvt_roundsh_ss::(_mm_setzero_ps(), k, a, b) + _mm_mask_cvt_roundsh_ss::(_mm_set_ss(0.0), k, a, b) } /// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) @@ -16169,7 +16169,7 @@ pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vcvtsh2sd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d { - 
_mm_mask_cvtsh_sd(_mm_setzero_pd(), k, a, b) + _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b) } /// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit) @@ -16228,7 +16228,7 @@ pub fn _mm_mask_cvt_roundsh_sd( #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub fn _mm_maskz_cvt_roundsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d { static_assert_sae!(SAE); - _mm_mask_cvt_roundsh_sd::(_mm_setzero_pd(), k, a, b) + _mm_mask_cvt_roundsh_sd::(_mm_set_sd(0.0), k, a, b) } /// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`. diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 68cf9b5960..d59b120fa5 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -517,17 +517,26 @@ mod test; pub use self::test::*; macro_rules! as_transmute { - ($from:ty => $($name:ident -> $to:ident),* $(,)?) => { + ($from:ty => $as_from:ident, $($as_to:ident -> $to:ident),* $(,)?) => { impl $from {$( #[inline] - pub(crate) fn $name(self) -> crate::core_arch::simd::$to { + pub(crate) fn $as_to(self) -> crate::core_arch::simd::$to { unsafe { transmute(self) } } )*} + $( + impl crate::core_arch::simd::$to { + #[inline] + pub(crate) fn $as_from(self) -> $from { + unsafe { transmute(self) } + } + } + )* }; } as_transmute!(__m128i => + as_m128i, as_u8x16 -> u8x16, as_u16x8 -> u16x8, as_u32x4 -> u32x4, @@ -538,6 +547,7 @@ as_transmute!(__m128i => as_i64x2 -> i64x2, ); as_transmute!(__m256i => + as_m256i, as_u8x32 -> u8x32, as_u16x16 -> u16x16, as_u32x8 -> u32x8, @@ -548,6 +558,7 @@ as_transmute!(__m256i => as_i64x4 -> i64x4, ); as_transmute!(__m512i => + as_m512i, as_u8x64 -> u8x64, as_u16x32 -> u16x32, as_u32x16 -> u32x16, @@ -558,35 +569,38 @@ as_transmute!(__m512i => as_i64x8 -> i64x8, ); -as_transmute!(__m128 => as_f32x4 -> f32x4); -as_transmute!(__m128d => as_f64x2 -> f64x2); -as_transmute!(__m256 => as_f32x8 -> f32x8); -as_transmute!(__m256d => as_f64x4 -> f64x4); -as_transmute!(__m512 => as_f32x16 -> f32x16); -as_transmute!(__m512d => as_f64x8 -> f64x8); +as_transmute!(__m128 => as_m128, as_f32x4 -> f32x4); +as_transmute!(__m128d => as_m128d, as_f64x2 -> f64x2); +as_transmute!(__m256 => as_m256, as_f32x8 -> f32x8); +as_transmute!(__m256d => as_m256d, as_f64x4 -> f64x4); +as_transmute!(__m512 => as_m512, as_f32x16 -> f32x16); +as_transmute!(__m512d => as_m512d, as_f64x8 -> f64x8); as_transmute!(__m128bh => + as_m128bh, as_u16x8 -> u16x8, as_u32x4 -> u32x4, as_i16x8 -> i16x8, as_i32x4 -> i32x4, ); as_transmute!(__m256bh => + as_m256bh, as_u16x16 -> u16x16, as_u32x8 -> u32x8, as_i16x16 -> i16x16, as_i32x8 -> i32x8, ); as_transmute!(__m512bh => + as_m512bh, as_u16x32 -> u16x32, as_u32x16 -> u32x16, as_i16x32 -> i16x32, as_i32x16 -> i32x16, ); -as_transmute!(__m128h => as_f16x8 -> f16x8); -as_transmute!(__m256h => as_f16x16 -> f16x16); -as_transmute!(__m512h => as_f16x32 -> f16x32); +as_transmute!(__m128h => as_m128h, as_f16x8 -> f16x8); +as_transmute!(__m256h => as_m256h, as_f16x16 -> f16x16); +as_transmute!(__m512h => as_m512h, as_f16x32 -> f16x32); mod eflags; #[stable(feature = "simd_x86", since = "1.27.0")] From 9713d4937c4c15806e4604e37d83de231db20052 Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 28 Apr 2025 11:32:14 +0530 Subject: [PATCH 3/3] Add `avx512vl` requirement to testsuite for avx512fp16 128 and 256 bit --- crates/core_arch/src/x86/avx512fp16.rs | 40 +++++++++++------------ crates/core_arch/src/x86/test.rs | 4 +-- 
---
 crates/core_arch/src/x86/avx512fp16.rs    | 40 +++++++++++------------
 crates/core_arch/src/x86/test.rs          |  4 +--
 crates/core_arch/src/x86_64/avx512fp16.rs |  8 ++---
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs
index b6d3d75ed0..b674875893 100644
--- a/crates/core_arch/src/x86/avx512fp16.rs
+++ b/crates/core_arch/src/x86/avx512fp16.rs
@@ -16670,14 +16670,14 @@ mod tests {
         )
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_set_ph() {
         let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_set_ph() {
         let r = _mm256_set_ph(
             1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -16703,21 +16703,21 @@ mod tests {
         assert_eq_m512h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_set_sh() {
         let r = _mm_set_sh(1.0);
         let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_set1_ph() {
         let r = _mm_set1_ph(1.0);
         let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_set1_ph() {
         let r = _mm256_set1_ph(1.0);
         let e = _mm256_set_ph(
@@ -16736,14 +16736,14 @@ mod tests {
         assert_eq_m512h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_setr_ph() {
         let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_setr_ph() {
         let r = _mm256_setr_ph(
             1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -16790,7 +16790,7 @@ mod tests {
         assert_eq_m512h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_castsi128_ph() {
         let a = _mm_set1_epi16(0x3c00);
         let r = _mm_castsi128_ph(a);
         let e = _mm_set1_ph(1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castsi256_ph() {
         let a = _mm256_set1_epi16(0x3c00);
         let r = _mm256_castsi256_ph(a);
@@ -16838,7 +16838,7 @@ mod tests {
         assert_eq_m512i(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_castps_ph() {
         let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
         let r = _mm_castps_ph(a);
         let e = _mm_set1_ph(1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castps_ph() {
         let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
         let r = _mm256_castps_ph(a);
@@ -16886,7 +16886,7 @@ mod tests {
         assert_eq_m512(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_castpd_ph() {
         let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
         let r = _mm_castpd_ph(a);
         let e = _mm_set1_ph(1.0);
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castpd_ph() {
         let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
         let r = _mm256_castpd_ph(a);
@@ -16934,7 +16934,7 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castph256_ph128() {
         let a = _mm256_setr_ph(
             1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
@@ -16944,7 +16944,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm512_castph512_ph128() {
         let a = _mm512_setr_ph(
             1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
@@ -16955,7 +16955,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm512_castph512_ph256() {
         let a = _mm512_setr_ph(
             1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
@@ -16968,21 +16968,21 @@ mod tests {
         assert_eq_m256h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_castph128_ph256() {
         let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
         let r = _mm256_castph128_ph256(a);
         assert_eq_m128h(_mm256_castph256_ph128(r), a);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm512_castph128_ph512() {
         let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
         let r = _mm512_castph128_ph512(a);
         assert_eq_m128h(_mm512_castph512_ph128(r), a);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm512_castph256_ph512() {
         let a = _mm256_setr_ph(
             1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
@@ -16991,7 +16991,7 @@ mod tests {
         assert_eq_m256h(_mm512_castph512_ph256(r), a);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm256_zextph128_ph256() {
         let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
         let r = _mm256_zextph128_ph256(a);
diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs
index 050a51f9fa..fec25ce2bc 100644
--- a/crates/core_arch/src/x86/test.rs
+++ b/crates/core_arch/src/x86/test.rs
@@ -37,7 +37,7 @@ pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
 }
 
 #[track_caller]
-#[target_feature(enable = "avx512fp16")]
+#[target_feature(enable = "avx512fp16,avx512vl")]
 pub unsafe fn assert_eq_m128h(a: __m128h, b: __m128h) {
     let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
     if r != 0b1111_1111 {
@@ -87,7 +87,7 @@ pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 {
 }
 
 #[track_caller]
-#[target_feature(enable = "avx512fp16")]
+#[target_feature(enable = "avx512fp16,avx512vl")]
 pub unsafe fn assert_eq_m256h(a: __m256h, b: __m256h) {
     let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
     if r != 0b11111111_11111111 {
diff --git a/crates/core_arch/src/x86_64/avx512fp16.rs b/crates/core_arch/src/x86_64/avx512fp16.rs
index 69f1dcb5c7..955c6ccc75 100644
--- a/crates/core_arch/src/x86_64/avx512fp16.rs
+++ b/crates/core_arch/src/x86_64/avx512fp16.rs
@@ -231,7 +231,7 @@ mod tests {
     use crate::core_arch::{x86::*, x86_64::*};
     use stdarch_test::simd_test;
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_cvti64_sh() {
         let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let r = _mm_cvti64_sh(a, 10);
@@ -239,7 +239,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_cvt_roundi64_sh() {
         let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let r = _mm_cvt_roundi64_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
@@ -247,7 +247,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_cvtu64_sh() {
         let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let r = _mm_cvtu64_sh(a, 10);
@@ -255,7 +255,7 @@ mod tests {
         assert_eq_m128h(r, e);
     }
 
-    #[simd_test(enable = "avx512fp16")]
+    #[simd_test(enable = "avx512fp16,avx512vl")]
     unsafe fn test_mm_cvt_roundu64_sh() {
         let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
         let r = _mm_cvt_roundu64_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);