Skip to content

Commit e6ff097

Browse files
paoloteti authored and alexcrichton committed
ARM DSP: add Quad/Double add/sub with exchange and select bytes intrinsics (#532)
- Quad 8-bit addition/subtraction - Double 8-bit addition/subtraction - Saturating Add and Subtract with Exchange and Saturating Subtract and Add with Exchange, signed - Select bytes based on GE bits This patch bumps the `assert_instr` limit to 22 (from 20) instead of adding a lot of exceptions for all DSP intrinsics.
1 parent b9de11a commit e6ff097

File tree

2 files changed

+187
-23
lines changed

2 files changed

+187
-23
lines changed

coresimd/arm/dsp.rs

+182-17
Original file line numberDiff line numberDiff line change
@@ -14,24 +14,48 @@ types! {
1414
pub struct uint16x2_t(u16, u16);
1515
}
1616

17+
macro_rules! dsp_call {
18+
($name:expr, $a:expr, $b:expr) => {
19+
::mem::transmute($name(::mem::transmute($a), ::mem::transmute($b)))
20+
};
21+
}
22+
1723
extern "C" {
18-
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd")]
24+
#[link_name = "llvm.arm.qadd"]
1925
fn arm_qadd(a: i32, b: i32) -> i32;
2026

21-
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub")]
22-
fn arm_qsub(a: i32, b: i32) -> i32;
27+
#[link_name = "llvm.arm.qadd16"]
28+
fn arm_qadd16(a: i32, b: i32) -> i32;
2329

24-
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd8")]
30+
#[link_name = "llvm.arm.qadd8"]
2531
fn arm_qadd8(a: i32, b: i32) -> i32;
2632

27-
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub8")]
28-
fn arm_qsub8(a: i32, b: i32) -> i32;
33+
#[link_name = "llvm.arm.qasx"]
34+
fn arm_qasx(a: i32, b: i32) -> i32;
2935

30-
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd16")]
31-
fn arm_qadd16(a: i32, b: i32) -> i32;
36+
#[link_name = "llvm.arm.qsax"]
37+
fn arm_qsax(a: i32, b: i32) -> i32;
3238

33-
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub16")]
39+
#[link_name = "llvm.arm.qsub"]
40+
fn arm_qsub(a: i32, b: i32) -> i32;
41+
42+
#[link_name = "llvm.arm.qsub8"]
43+
fn arm_qsub8(a: i32, b: i32) -> i32;
44+
45+
#[link_name = "llvm.arm.qsub16"]
3446
fn arm_qsub16(a: i32, b: i32) -> i32;
47+
48+
#[link_name = "llvm.arm.sadd16"]
49+
fn arm_sadd16(a: i32, b: i32) -> i32;
50+
51+
#[link_name = "llvm.arm.sadd8"]
52+
fn arm_sadd8(a: i32, b: i32) -> i32;
53+
54+
#[link_name = "llvm.arm.sasx"]
55+
fn arm_sasx(a: i32, b: i32) -> i32;
56+
57+
#[cfg_attr(not(target_feature = "mclass"), link_name = "llvm.arm.sel")]
58+
fn arm_sel(a: i32, b: i32) -> i32;
3559
}
3660

3761
/// Signed saturating addition
@@ -63,7 +87,7 @@ pub unsafe fn qsub(a: i32, b: i32) -> i32 {
6387
#[inline]
6488
#[cfg_attr(test, assert_instr(qadd8))]
6589
pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
66-
::mem::transmute(arm_qadd8(::mem::transmute(a), ::mem::transmute(b)))
90+
dsp_call!(arm_qadd8, a, b)
6791
}
6892

6993
/// Saturating four 8-bit integer subtractions
@@ -77,7 +101,7 @@ pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
77101
#[inline]
78102
#[cfg_attr(test, assert_instr(qsub8))]
79103
pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
80-
::mem::transmute(arm_qsub8(::mem::transmute(a), ::mem::transmute(b)))
104+
dsp_call!(arm_qsub8, a, b)
81105
}
82106

83107
/// Saturating two 16-bit integer subtraction
@@ -89,7 +113,7 @@ pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
89113
#[inline]
90114
#[cfg_attr(test, assert_instr(qsub16))]
91115
pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
92-
::mem::transmute(arm_qsub16(::mem::transmute(a), ::mem::transmute(b)))
116+
dsp_call!(arm_qsub16, a, b)
93117
}
94118

95119
/// Saturating two 16-bit integer additions
@@ -101,7 +125,80 @@ pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
101125
#[inline]
102126
#[cfg_attr(test, assert_instr(qadd16))]
103127
pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
104-
::mem::transmute(arm_qadd16(::mem::transmute(a), ::mem::transmute(b)))
128+
dsp_call!(arm_qadd16, a, b)
129+
}
130+
131+
/// Returns the 16-bit signed saturated equivalent of
132+
///
133+
/// res[0] = a[0] - b[1]
134+
/// res[1] = a[1] + b[0]
135+
#[inline]
136+
#[cfg_attr(test, assert_instr(qasx))]
137+
pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
138+
dsp_call!(arm_qasx, a, b)
139+
}
140+
141+
/// Returns the 16-bit signed saturated equivalent of
142+
///
143+
/// res[0] = a[0] + b[1]
144+
/// res[1] = a[1] - b[0]
145+
#[inline]
146+
#[cfg_attr(test, assert_instr(qsax))]
147+
pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
148+
dsp_call!(arm_qsax, a, b)
149+
}
150+
151+
/// Returns the 16-bit signed equivalent of
152+
///
153+
/// res[0] = a[0] + b[0]
154+
/// res[1] = a[1] + b[1]
155+
///
156+
/// and the GE bits of the APSR are set.
157+
#[inline]
158+
#[cfg_attr(test, assert_instr(sadd16))]
159+
pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
160+
dsp_call!(arm_sadd16, a, b)
161+
}
162+
163+
/// Returns the 8-bit signed equivalent of
164+
///
165+
/// res[0] = a[0] + b[0]
166+
/// res[1] = a[1] + b[1]
167+
/// res[2] = a[2] + b[2]
168+
/// res[3] = a[3] + b[3]
169+
///
170+
/// and the GE bits of the APSR are set.
171+
#[inline]
172+
#[cfg_attr(test, assert_instr(sadd8))]
173+
pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
174+
dsp_call!(arm_sadd8, a, b)
175+
}
176+
177+
/// Returns the 16-bit signed equivalent of
178+
///
179+
/// res[0] = a[0] - b[1]
180+
/// res[1] = a[1] + b[0]
181+
///
182+
/// and the GE bits of the APSR are set.
183+
#[inline]
184+
#[cfg_attr(test, assert_instr(sasx))]
185+
pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
186+
dsp_call!(arm_sasx, a, b)
187+
}
188+
189+
/// Returns the equivalent of
190+
///
191+
/// res[0] = GE[0] ? a[0] : b[0]
192+
/// res[1] = GE[1] ? a[1] : b[1]
193+
/// res[2] = GE[2] ? a[2] : b[2]
194+
/// res[3] = GE[3] ? a[3] : b[3]
195+
///
196+
/// where GE are bits of APSR
197+
#[inline]
198+
#[cfg_attr(test, assert_instr(sel))]
199+
#[cfg(all(not(target_feature = "mclass")))]
200+
pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
201+
dsp_call!(arm_sel, a, b)
105202
}
106203

107204
#[cfg(test)]
@@ -135,7 +232,7 @@ mod tests {
135232
let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
136233
let b = i8x4::new(2, -1, 0, 1);
137234
let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
138-
let r: i8x4 = ::mem::transmute(dsp::qadd8(::mem::transmute(a), ::mem::transmute(b)));
235+
let r: i8x4 = dsp_call!(dsp::qadd8, a, b);
139236
assert_eq!(r, c);
140237
}
141238
}
@@ -146,7 +243,7 @@ mod tests {
146243
let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
147244
let b = i8x4::new(2, -1, 0, 1);
148245
let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
149-
let r: i8x4 = ::mem::transmute(dsp::qsub8(::mem::transmute(a),::mem::transmute(b)));
246+
let r: i8x4 = dsp_call!(dsp::qsub8, a, b);
150247
assert_eq!(r, c);
151248
}
152249
}
@@ -157,7 +254,7 @@ mod tests {
157254
let a = i16x2::new(1, 2);
158255
let b = i16x2::new(2, -1);
159256
let c = i16x2::new(3, 1);
160-
let r: i16x2 = ::mem::transmute(dsp::qadd16(::mem::transmute(a),::mem::transmute(b)));
257+
let r: i16x2 = dsp_call!(dsp::qadd16, a, b);
161258
assert_eq!(r, c);
162259
}
163260
}
@@ -168,7 +265,75 @@ mod tests {
168265
let a = i16x2::new(10, 20);
169266
let b = i16x2::new(20, -10);
170267
let c = i16x2::new(-10, 30);
171-
let r: i16x2 = ::mem::transmute(dsp::qsub16(::mem::transmute(a), ::mem::transmute(b)));
268+
let r: i16x2 = dsp_call!(dsp::qsub16, a, b);
269+
assert_eq!(r, c);
270+
}
271+
}
272+
273+
#[test]
274+
fn qasx() {
275+
unsafe {
276+
let a = i16x2::new(1, ::std::i16::MAX);
277+
let b = i16x2::new(2, 2);
278+
let c = i16x2::new(-1, ::std::i16::MAX);
279+
let r: i16x2 = dsp_call!(dsp::qasx, a, b);
280+
assert_eq!(r, c);
281+
}
282+
}
283+
284+
#[test]
285+
fn qsax() {
286+
unsafe {
287+
let a = i16x2::new(1, ::std::i16::MAX);
288+
let b = i16x2::new(2, 2);
289+
let c = i16x2::new(3, ::std::i16::MAX - 2);
290+
let r: i16x2 = dsp_call!(dsp::qsax, a, b);
291+
assert_eq!(r, c);
292+
}
293+
}
294+
295+
#[test]
296+
fn sadd16() {
297+
unsafe {
298+
let a = i16x2::new(1, ::std::i16::MAX);
299+
let b = i16x2::new(2, 2);
300+
let c = i16x2::new(3, -::std::i16::MAX);
301+
let r: i16x2 = dsp_call!(dsp::sadd16, a, b);
302+
assert_eq!(r, c);
303+
}
304+
}
305+
306+
#[test]
307+
fn sadd8() {
308+
unsafe {
309+
let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
310+
let b = i8x4::new(4, 3, 2, 2);
311+
let c = i8x4::new(5, 5, 5, -::std::i8::MAX);
312+
let r: i8x4 = dsp_call!(dsp::sadd8, a, b);
313+
assert_eq!(r, c);
314+
}
315+
}
316+
317+
#[test]
318+
fn sasx() {
319+
unsafe {
320+
let a = i16x2::new(1, 2);
321+
let b = i16x2::new(2, 1);
322+
let c = i16x2::new(0, 4);
323+
let r: i16x2 = dsp_call!(dsp::sasx, a, b);
324+
assert_eq!(r, c);
325+
}
326+
}
327+
328+
#[test]
329+
fn sel() {
330+
unsafe {
331+
let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
332+
let b = i8x4::new(4, 3, 2, 2);
333+
// call sadd8() to set GE bits
334+
dsp::sadd8(::mem::transmute(a), ::mem::transmute(b));
335+
let c = i8x4::new(1, 2, 3, ::std::i8::MAX);
336+
let r: i8x4 = dsp_call!(dsp::sel, a, b);
172337
assert_eq!(r, c);
173338
}
174339
}

crates/stdsimd-test/src/lib.rs

+5-6
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
338338

339339
let instruction_limit = match expected {
340340
// cpuid returns a pretty big aggregate structure so exempt it from
341-
// the slightly more restrictive 20 instructions below
341+
// the slightly more restrictive 22 instructions below
342342
"cpuid" => 30,
343343

344344
// Apparently on Windows LLVM generates a bunch of saves/restores of
@@ -351,11 +351,10 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
351351
// cases exceed the limit.
352352
"cvtpi2ps" => 25,
353353

354-
// In this case the overall length, counting also the 'mergefunc'
355-
// workaround overhead, is exactly 20 instructions.
356-
"qsub8" | "qadd8" | "qsub16" | "qadd16" => 22,
357-
358-
_ => 20,
354+
// Original limit was 20 instructions, but ARM DSP Intrinsics are
355+
// exactly 20 instructions long. So bump the limit to 22 instead of
356+
// adding a long list of exceptions here.
357+
_ => 22,
359358
};
360359
let probably_only_one_instruction = instrs.len() < instruction_limit;
361360

0 commit comments

Comments
 (0)