Skip to content

Commit ccd9bb2

Browse files
xqft, mpaulucci, IAvecilla, klaus993, MauroToscano
authored
Implemented POC of parallel FFT and twiddle generation in Metal (#190)
* Added FFTMetalState * Added device field to FFTMetalState * Added WIP twiddle factors generator in GPU * Small fixes in WIP twiddle GPU generator * Changed twiddle generator function constants to buffers, formatting and small fixes * Removed unused helper, formatting * [WIP] Workaround for result buffer, completed gen_twiddles, tests * Update compiled metal lib * Small fixes in gen_twiddles * Changed gen_twiddles_cpu loop * Fix twiddle generation on gpu. * Fix artifacts from broken main * Fixed threadgroup dispatching from twiddles * [WIP] POC Parallel FFT * Move all gpu related features to a new crate * [WIP] Fix metal kernel * [WIP] Various changes related to Metal FFT * [WIP] For testing purposes * Use real U32 field. * [WIP] Fixed undefined behaviours * Finished FFT in Metal POC * Removed unused line * Added comments for unsafe lines * Changed storage mode for metal buffers * Replaced static array with unsafe vector from pointer * Proptest for Metal FFT * Simplified stage loop * Split Metal setup from execution Will be useful for benchmarks * Fixed basetype size * Changed FFT Metal test name * Small fix * Fixed test memory leaks * Added benchmarks * Fixed leaks on benchmarks * Moved and revamped parallel twiddle generation * Formatting * Add build-essential to try to fix dependency problem * Add sudo to apt install * Swap apt install to g++ * Try other solutions to dependency problem * Try reinstall g++ * Try installing gobjc++-mingw * Try gobjc++ * Try gobjc * Try all * Add gnustep-base-common * Try cargo check as command * Try CFLAGS * Update CFLAGS * Export CFLAGS * Update CFLAGS * Try cross * Fix syntax * Try CFLAGS * Fix syntax * Try macos runners * Swap other CI steps to macos runner * Delete unused function * Add benchmark for twiddles to the general benchmark function * Add conditional compilation for objc crate * Swap runners back to ubuntu * Add conditional compilation for metal crate to compile only in macos * Revert conditional compilation for 
metal and objc crates * Deleted operations.rs * Added newline at end of fp.h.metal * Reverted name change of U64TestField * Change runners in CI to run in macos * Ignore metal tests * Removed some unsafe blocks, added comments * Removed metal dependency from math crate * Removed GPU from default members * Reverted to use ubuntu on CI * Changed coverage host to macos --------- Co-authored-by: Martin Paulucci <[email protected]> Co-authored-by: IAvecilla <[email protected]> Co-authored-by: klaus993 <[email protected]> Co-authored-by: MauroFab <[email protected]>
1 parent 43cc888 commit ccd9bb2

File tree

19 files changed

+724
-3
lines changed

19 files changed

+724
-3
lines changed

.github/workflows/tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
command: check
3131

3232
coverage:
33-
runs-on: ubuntu-latest
33+
runs-on: macos-latest
3434
env:
3535
CARGO_TERM_COLOR: always
3636
steps:

Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
[workspace]
2-
2+
default-members = [
3+
"math",
4+
"crypto",
5+
"proving_system/stark"
6+
]
37
members = [
48
"math",
59
"crypto",
610
"proving_system/stark",
11+
"gpu",
712
]

gpu/Cargo.toml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[package]
2+
name = "lambdaworks-gpu"
3+
version = "0.1.0"
4+
edition = "2021"
5+
6+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7+
8+
[dependencies]
9+
lambdaworks-math = { path = "../math" }
10+
lambdaworks-crypto = { path = "../crypto"}
11+
rand = "0.8.5"
12+
metal = "0.24.0"
13+
14+
[dev-dependencies]
15+
proptest = "1.1.0"
16+
criterion = "0.4.0"
17+
objc = "0.2.7"
18+
19+
[[bench]]
20+
name = "metal_benchmarks"
21+
harness = false

gpu/benches/all_benchmarks.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
use criterion::{criterion_group, criterion_main, Criterion};
2+
3+
mod benchmarks;
4+
5+
/// Criterion entry point: runs the twiddle-generation benchmarks first and the
/// FFT benchmarks second, both registered on the same `Criterion` instance.
fn run_all_benchmarks(c: &mut Criterion) {
    use benchmarks::metal::{metal_fft_benchmarks, metal_fft_twiddles_benchmarks};

    metal_fft_twiddles_benchmarks(c);
    metal_fft_benchmarks(c);
}
9+
10+
criterion_group!(benches, run_all_benchmarks);
11+
criterion_main!(benches);

gpu/benches/benchmarks/metal.rs

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
use criterion::Criterion;
2+
use lambdaworks_gpu::fft::fft_metal::*;
3+
use lambdaworks_math::{
4+
fft::bit_reversing::in_place_bit_reverse_permute,
5+
field::{element::FieldElement, traits::IsTwoAdicField},
6+
field::{test_fields::u32_test_field::U32TestField, traits::RootsConfig},
7+
};
8+
use rand::random;
9+
10+
type F = U32TestField;
11+
type FE = FieldElement<F>;
12+
13+
fn gen_coeffs(pow: usize) -> Vec<FE> {
14+
let mut result = Vec::with_capacity(1 << pow);
15+
for _ in 0..result.capacity() {
16+
result.push(FE::new(random()));
17+
}
18+
result
19+
}
20+
21+
pub fn metal_fft_benchmarks(c: &mut Criterion) {
22+
let mut group = c.benchmark_group("metal_fft");
23+
24+
for order in 20..=24 {
25+
let coeffs = gen_coeffs(order);
26+
group.throughput(criterion::Throughput::Elements(1 << order)); // info for criterion
27+
28+
// the objective is to bench ordered FFT, including twiddles generation and Metal setup
29+
group.bench_with_input(
30+
format!("parallel_nr_2radix_2^{order}_coeffs"),
31+
&coeffs,
32+
|bench, coeffs| {
33+
bench.iter(|| {
34+
// TODO: autoreleaspool hurts perf. by 2-3%. Search for an alternative
35+
objc::rc::autoreleasepool(|| {
36+
let coeffs = coeffs.clone();
37+
let twiddles =
38+
F::get_twiddles(order as u64, RootsConfig::BitReverse).unwrap();
39+
let fft_metal = FFTMetalState::new(None).unwrap();
40+
let command_buff_encoder = fft_metal
41+
.setup_fft("radix2_dit_butterfly", &twiddles)
42+
.unwrap();
43+
44+
let mut result = fft_metal
45+
.execute_fft(&coeffs, command_buff_encoder)
46+
.unwrap();
47+
48+
in_place_bit_reverse_permute(&mut result);
49+
});
50+
});
51+
},
52+
);
53+
}
54+
55+
group.finish();
56+
}
57+
58+
pub fn metal_fft_twiddles_benchmarks(c: &mut Criterion) {
59+
let mut group = c.benchmark_group("metal_fft");
60+
group.sample_size(10); // it becomes too slow with the default of 100
61+
62+
for order in 2..=4 {
63+
group.throughput(criterion::Throughput::Elements(1 << order)); // info for criterion
64+
65+
// the objective is to bench ordered FFT, including twiddles generation and Metal setup
66+
group.bench_with_input(
67+
format!("parallel_twiddle_factors_2^({order}-1)_elems"),
68+
&order,
69+
|bench, order| {
70+
bench.iter(|| {
71+
// TODO: autoreleaspool hurts perf. by 2-3%. Search for an alternative
72+
objc::rc::autoreleasepool(|| {
73+
let metal_state = FFTMetalState::new(None).unwrap();
74+
let _gpu_twiddles = metal_state.gen_twiddles::<F>(*order).unwrap();
75+
});
76+
});
77+
},
78+
);
79+
}
80+
81+
group.finish();
82+
}

gpu/benches/benchmarks/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pub mod metal;

gpu/benches/metal_benchmarks.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
use criterion::{criterion_group, criterion_main, Criterion};
2+
3+
mod benchmarks;
4+
5+
/// Criterion entry point for the Metal benchmarks: twiddle generation runs
/// first, followed by the FFT benchmarks, on the same `Criterion` instance.
fn run_metal_benchmarks(c: &mut Criterion) {
    use benchmarks::metal::{metal_fft_benchmarks, metal_fft_twiddles_benchmarks};

    metal_fft_twiddles_benchmarks(c);
    metal_fft_benchmarks(c);
}
9+
10+
criterion_group!(benches, run_metal_benchmarks);
11+
criterion_main!(benches);

0 commit comments

Comments
 (0)