Skip to content

Implement missing sse4a and tbm intrinsics #1607

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 61 additions & 3 deletions crates/core_arch/src/x86/sse4a.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@ use stdarch_test::assert_instr;
// Raw declarations of the LLVM SSE4a intrinsics used by the public wrappers in this file.
// Each `link_name` names the LLVM intrinsic that the following declaration binds to.
extern "C" {
// Extraction with the bit range taken from the vector operand `y`
// (called by `_mm_extract_si64`).
#[link_name = "llvm.x86.sse4a.extrq"]
fn extrq(x: i64x2, y: i8x16) -> i64x2;
// Extraction with the length and index passed as separate `u8` values
// (called by `_mm_extracti_si64`, which forwards its const generics here).
#[link_name = "llvm.x86.sse4a.extrqi"]
fn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2;
// Insertion of bits from `y` into `x`, with both operands passed as vectors
// (called by `_mm_insert_si64`).
#[link_name = "llvm.x86.sse4a.insertq"]
fn insertq(x: i64x2, y: i64x2) -> i64x2;
// Insertion with the length and index passed as separate `u8` values
// (called by `_mm_inserti_si64`, which forwards its const generics here).
#[link_name = "llvm.x86.sse4a.insertqi"]
fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2;
// Binding for `llvm.x86.sse4a.movnt.sd`: takes a destination `*mut f64` and a `__m128d` source.
// NOTE(review): presumably backs the non-temporal f64 store wrapper; its caller is outside this view.
#[link_name = "llvm.x86.sse4a.movnt.sd"]
fn movntsd(x: *mut f64, y: __m128d);
// Binding for `llvm.x86.sse4a.movnt.ss`: takes a destination `*mut f32` and a `__m128` source.
// NOTE(review): presumably backs the non-temporal f32 store wrapper; its caller is outside this view.
#[link_name = "llvm.x86.sse4a.movnt.ss"]
fn movntss(x: *mut f32, y: __m128);
}

// NOTE: `_mm_extracti_si64(x, len, idx)` (EXTRQ) and `_mm_inserti_si64(x, y, len, idx)` (INSERTQ)
// were previously blocked on #248; both are implemented below using const generics.

/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
///
/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
Expand All @@ -39,6 +40,27 @@ pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i {
transmute(extrq(x.as_i64x2(), y.as_i8x16()))
}

/// Extracts a bit field of length `len` starting at bit `idx` from the lower 64 bits of the
/// 128-bit integer vector `x`.
///
/// `idx` is the position of the field's least-significant bit and `len` is the number of bits
/// taken. When both are zero, bits `[63:0]` of `x` are extracted. Compilation fails if
/// `len + idx` exceeds 64, or if `len` is zero while `idx` is non-zero.
///
/// The extracted bits are returned in the lower 64 bits of the resulting 128-bit integer vector.
#[inline]
#[target_feature(enable = "sse4a")]
#[cfg_attr(test, assert_instr(extrq, LEN = 5, IDX = 5))]
#[rustc_legacy_const_generics(1, 2)]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_extracti_si64<const LEN: i32, const IDX: i32>(x: __m128i) -> __m128i {
    // Each immediate must fit in 6 bits, and the field must be a valid range; LLVM documents
    // anything else as UB, so it is rejected at compile time.
    static_assert_uimm_bits!(LEN, 6);
    static_assert_uimm_bits!(IDX, 6);
    static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64));
    let source = x.as_i64x2();
    let extracted = extrqi(source, LEN as u8, IDX as u8);
    transmute(extracted)
}

/// Inserts the `[length:0]` bits of `y` into `x` at `index`.
///
/// The bits of `y`:
Expand All @@ -56,6 +78,25 @@ pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
transmute(insertq(x.as_i64x2(), y.as_i64x2()))
}

/// Copies the `len` least-significant bits of the lower 64 bits of `y` into the lower 64 bits
/// of `x`, placing them so that the field's least-significant bit lands at bit `idx`.
///
/// `idx` is the position of the field's least-significant bit and `len` is the number of bits
/// inserted. When both are zero, bits `[63:0]` of `x` are replaced with bits `[63:0]` of `y`.
/// Compilation fails if `len + idx` exceeds 64, or if `len` is zero while `idx` is non-zero.
#[inline]
#[target_feature(enable = "sse4a")]
#[cfg_attr(test, assert_instr(insertq, LEN = 5, IDX = 5))]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i) -> __m128i {
    // Each immediate must fit in 6 bits, and the field must be a valid range; LLVM documents
    // anything else as UB, so it is rejected at compile time.
    static_assert_uimm_bits!(LEN, 6);
    static_assert_uimm_bits!(IDX, 6);
    static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64));
    let destination = x.as_i64x2();
    let source = y.as_i64x2();
    let merged = insertqi(destination, source, LEN as u8, IDX as u8);
    transmute(merged)
}

/// Non-temporal store of `a.0` into `p`.
///
/// Writes 64-bit data to a memory location without polluting the caches.
Expand Down Expand Up @@ -114,6 +155,14 @@ mod tests {
assert_eq_m128i(r, e);
}

#[simd_test(enable = "sse4a")]
unsafe fn test_mm_extracti_si64() {
    // Bits [15:8] of the low quadword 0x0123456789abcdef are 0xcd.
    let expected = _mm_setr_epi64x(0xcd, 0);
    let input = _mm_setr_epi64x(0x0123456789abcdef, 0);
    let actual = _mm_extracti_si64::<8, 8>(input);
    assert_eq_m128i(actual, expected);
}

#[simd_test(enable = "sse4a")]
unsafe fn test_mm_insert_si64() {
let i = 0b0110_i64;
Expand All @@ -131,6 +180,15 @@ mod tests {
assert_eq_m128i(r, expected);
}

#[simd_test(enable = "sse4a")]
unsafe fn test_mm_inserti_si64() {
    // The low 8 bits of `source` (0x77) overwrite bits [15:8] of `target` (0xcd).
    let expected = _mm_setr_epi64x(0x0123456789ab77ef, 0);
    let target = _mm_setr_epi64x(0x0123456789abcdef, 0);
    let source = _mm_setr_epi64x(0x0011223344556677, 0);
    let actual = _mm_inserti_si64::<8, 8>(target, source);
    assert_eq_m128i(actual, expected);
}

#[repr(align(16))]
struct MemoryF64 {
data: [f64; 2],
Expand Down
Loading