Skip to content

ARM DSP: add quad/double add/sub + exchange and select bytes intrinsics #532

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 182 additions & 17 deletions coresimd/arm/dsp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,48 @@ types! {
pub struct uint16x2_t(u16, u16);
}

macro_rules! dsp_call {
($name:expr, $a:expr, $b:expr) => {
::mem::transmute($name(::mem::transmute($a), ::mem::transmute($b)))
};
}

extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd")]
#[link_name = "llvm.arm.qadd"]
fn arm_qadd(a: i32, b: i32) -> i32;

#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub")]
fn arm_qsub(a: i32, b: i32) -> i32;
#[link_name = "llvm.arm.qadd16"]
fn arm_qadd16(a: i32, b: i32) -> i32;

#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd8")]
#[link_name = "llvm.arm.qadd8"]
fn arm_qadd8(a: i32, b: i32) -> i32;

#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub8")]
fn arm_qsub8(a: i32, b: i32) -> i32;
#[link_name = "llvm.arm.qasx"]
fn arm_qasx(a: i32, b: i32) -> i32;

#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd16")]
fn arm_qadd16(a: i32, b: i32) -> i32;
#[link_name = "llvm.arm.qsax"]
fn arm_qsax(a: i32, b: i32) -> i32;

#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub16")]
#[link_name = "llvm.arm.qsub"]
fn arm_qsub(a: i32, b: i32) -> i32;

#[link_name = "llvm.arm.qsub8"]
fn arm_qsub8(a: i32, b: i32) -> i32;

#[link_name = "llvm.arm.qsub16"]
fn arm_qsub16(a: i32, b: i32) -> i32;

#[link_name = "llvm.arm.sadd16"]
fn arm_sadd16(a: i32, b: i32) -> i32;

#[link_name = "llvm.arm.sadd8"]
fn arm_sadd8(a: i32, b: i32) -> i32;

#[link_name = "llvm.arm.sasx"]
fn arm_sasx(a: i32, b: i32) -> i32;

#[cfg_attr(not(target_feature = "mclass"), link_name = "llvm.arm.sel")]
fn arm_sel(a: i32, b: i32) -> i32;
}

/// Signed saturating addition
Expand Down Expand Up @@ -63,7 +87,7 @@ pub unsafe fn qsub(a: i32, b: i32) -> i32 {
#[inline]
#[cfg_attr(test, assert_instr(qadd8))]
pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
::mem::transmute(arm_qadd8(::mem::transmute(a), ::mem::transmute(b)))
dsp_call!(arm_qadd8, a, b)
}

/// Saturating two 8-bit integer subtraction
Expand All @@ -77,7 +101,7 @@ pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
#[inline]
#[cfg_attr(test, assert_instr(qsub8))]
pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
::mem::transmute(arm_qsub8(::mem::transmute(a), ::mem::transmute(b)))
dsp_call!(arm_qsub8, a, b)
}

/// Saturating two 16-bit integer subtraction
Expand All @@ -89,7 +113,7 @@ pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
#[inline]
#[cfg_attr(test, assert_instr(qsub16))]
pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
::mem::transmute(arm_qsub16(::mem::transmute(a), ::mem::transmute(b)))
dsp_call!(arm_qsub16, a, b)
}

/// Saturating two 16-bit integer additions
Expand All @@ -101,7 +125,80 @@ pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
#[inline]
#[cfg_attr(test, assert_instr(qadd16))]
pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
::mem::transmute(arm_qadd16(::mem::transmute(a), ::mem::transmute(b)))
dsp_call!(arm_qadd16, a, b)
}

/// Returns the 16-bit signed saturated equivalent of
///
/// res[0] = a[0] - b[1]
/// res[1] = a[1] + b[0]
#[inline]
#[cfg_attr(test, assert_instr(qasx))]
pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_qasx, a, b)
}

/// Returns the 16-bit signed saturated equivalent of
///
/// res[0] = a[0] + b[1]
/// res[1] = a[1] - b[0]
#[inline]
#[cfg_attr(test, assert_instr(qsax))]
pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_qsax, a, b)
}

/// Returns the 16-bit signed equivalent of
///
/// res[0] = a[0] + b[0]
/// res[1] = a[1] + b[1]
///
/// and the GE bits of the APSR are set.
///
/// The additions are performed lane by lane and are *not* saturating: a sum
/// that does not fit in 16 bits wraps around (for example
/// `i16::MAX + 2 == -i16::MAX`).
#[inline]
#[cfg_attr(test, assert_instr(sadd16))]
pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
    dsp_call!(arm_sadd16, a, b)
}

/// Returns the 8-bit signed equivalent of
///
/// res[0] = a[0] + b[0]
/// res[1] = a[1] + b[1]
/// res[2] = a[2] + b[2]
/// res[3] = a[3] + b[3]
///
/// and the GE bits of the APSR are set.
///
/// The additions are performed lane by lane and are *not* saturating: a sum
/// that does not fit in 8 bits wraps around (for example
/// `i8::MAX + 2 == -i8::MAX`).
#[inline]
#[cfg_attr(test, assert_instr(sadd8))]
pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
    dsp_call!(arm_sadd8, a, b)
}

/// Returns the 16-bit signed equivalent of
///
/// res[0] = a[0] - b[1]
/// res[1] = a[1] + b[0]
///
/// and the GE bits of the APSR are set.
#[inline]
#[cfg_attr(test, assert_instr(sasx))]
pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_sasx, a, b)
}

/// Returns the equivalent of
///
/// res[0] = GE[0] ? a[0] : b[0]
/// res[1] = GE[1] ? a[1] : b[1]
/// res[2] = GE[2] ? a[2] : b[2]
/// res[3] = GE[3] ? a[3] : b[3]
///
/// where GE are bits of APSR
#[inline]
#[cfg_attr(test, assert_instr(sel))]
#[cfg(all(not(target_feature = "mclass")))]
pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
dsp_call!(arm_sel, a, b)
}

#[cfg(test)]
Expand Down Expand Up @@ -135,7 +232,7 @@ mod tests {
let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
let b = i8x4::new(2, -1, 0, 1);
let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
let r: i8x4 = ::mem::transmute(dsp::qadd8(::mem::transmute(a), ::mem::transmute(b)));
let r: i8x4 = dsp_call!(dsp::qadd8, a, b);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is the macro being used in the tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to cut long lines. For sure I can use ::mem::transmute, but for my understanding why is it wrong here?

assert_eq!(r, c);
}
}
Expand All @@ -146,7 +243,7 @@ mod tests {
let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
let b = i8x4::new(2, -1, 0, 1);
let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
let r: i8x4 = ::mem::transmute(dsp::qsub8(::mem::transmute(a),::mem::transmute(b)));
let r: i8x4 = dsp_call!(dsp::qsub8, a, b);
assert_eq!(r, c);
}
}
Expand All @@ -157,7 +254,7 @@ mod tests {
let a = i16x2::new(1, 2);
let b = i16x2::new(2, -1);
let c = i16x2::new(3, 1);
let r: i16x2 = ::mem::transmute(dsp::qadd16(::mem::transmute(a),::mem::transmute(b)));
let r: i16x2 = dsp_call!(dsp::qadd16, a, b);
assert_eq!(r, c);
}
}
Expand All @@ -168,7 +265,75 @@ mod tests {
let a = i16x2::new(10, 20);
let b = i16x2::new(20, -10);
let c = i16x2::new(-10, 30);
let r: i16x2 = ::mem::transmute(dsp::qsub16(::mem::transmute(a), ::mem::transmute(b)));
let r: i16x2 = dsp_call!(dsp::qsub16, a, b);
assert_eq!(r, c);
}
}

#[test]
fn qasx() {
    unsafe {
        let lhs = i16x2::new(1, ::std::i16::MAX);
        let rhs = i16x2::new(2, 2);
        // Lane 0 is 1 - 2; lane 1 is MAX + 2, which saturates at MAX.
        let expected = i16x2::new(-1, ::std::i16::MAX);
        let actual: i16x2 = dsp_call!(dsp::qasx, lhs, rhs);
        assert_eq!(actual, expected);
    }
}

#[test]
fn qsax() {
    unsafe {
        let lhs = i16x2::new(1, ::std::i16::MAX);
        let rhs = i16x2::new(2, 2);
        // Lane 0 is 1 + 2; lane 1 is MAX - 2.
        let expected = i16x2::new(3, ::std::i16::MAX - 2);
        let actual: i16x2 = dsp_call!(dsp::qsax, lhs, rhs);
        assert_eq!(actual, expected);
    }
}

#[test]
fn sadd16() {
    unsafe {
        let lhs = i16x2::new(1, ::std::i16::MAX);
        let rhs = i16x2::new(2, 2);
        // Lane 1 is MAX + 2, which is expected to wrap around to -MAX.
        let expected = i16x2::new(3, -::std::i16::MAX);
        let actual: i16x2 = dsp_call!(dsp::sadd16, lhs, rhs);
        assert_eq!(actual, expected);
    }
}

#[test]
fn sadd8() {
    unsafe {
        let lhs = i8x4::new(1, 2, 3, ::std::i8::MAX);
        let rhs = i8x4::new(4, 3, 2, 2);
        // Lane 3 is MAX + 2, which is expected to wrap around to -MAX.
        let expected = i8x4::new(5, 5, 5, -::std::i8::MAX);
        let actual: i8x4 = dsp_call!(dsp::sadd8, lhs, rhs);
        assert_eq!(actual, expected);
    }
}

#[test]
fn sasx() {
    unsafe {
        let lhs = i16x2::new(1, 2);
        let rhs = i16x2::new(2, 1);
        // Lane 0 is 1 - 1; lane 1 is 2 + 2.
        let expected = i16x2::new(0, 4);
        let actual: i16x2 = dsp_call!(dsp::sasx, lhs, rhs);
        assert_eq!(actual, expected);
    }
}

#[test]
fn sel() {
    unsafe {
        let lhs = i8x4::new(1, 2, 3, ::std::i8::MAX);
        let rhs = i8x4::new(4, 3, 2, 2);
        // The sum itself is discarded: sadd8() is called only to set the GE
        // bits that sel() reads.
        dsp::sadd8(::mem::transmute(lhs), ::mem::transmute(rhs));
        // Every lane is expected to be taken from the first operand.
        let expected = i8x4::new(1, 2, 3, ::std::i8::MAX);
        let actual: i8x4 = dsp_call!(dsp::sel, lhs, rhs);
        assert_eq!(actual, expected);
    }
}
Expand Down
11 changes: 5 additions & 6 deletions crates/stdsimd-test/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {

let instruction_limit = match expected {
// cpuid returns a pretty big aggregate structure so exempt it from
// the slightly more restrictive 20 instructions below
// the slightly more restrictive 22 instructions below
"cpuid" => 30,

// Apparently on Windows LLVM generates a bunch of saves/restores of
Expand All @@ -351,11 +351,10 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
// cases exceed the limit.
"cvtpi2ps" => 25,

// In this case the overall length, counting also the 'mergefunc'
// workaround overhead, is exactly 20 instructions.
"qsub8" | "qadd8" | "qsub16" | "qadd16" => 22,

_ => 20,
// Original limit was 20 instructions, but ARM DSP Intrinsics are
// exactly 20 instructions long. So bump the limit to 22 instead of
// adding a long list of exceptions here.
_ => 22,
};
let probably_only_one_instruction = instrs.len() < instruction_limit;

Expand Down