diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..2006ccd
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+*.bin
+example_test/
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 5a48dd5..032b052 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 /target
-*.bin
\ No newline at end of file
+*.bin
+/example_test
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index 63e0b9b..5be9d88 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -105,7 +105,7 @@ dependencies = [
 
 [[package]]
 name = "llama_cpp_rs"
-version = "0.1.2"
+version = "0.2.0"
 dependencies = [
  "bindgen",
  "cc",
diff --git a/Cargo.toml b/Cargo.toml
index 2def871..fb6b762 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,12 +2,24 @@
 authors = ["mdrokz "]
 name = "llama_cpp_rs"
 description = "Rust bindings for LLAMA.CPP inference"
-categories = ["api-bindings","development-tools::ffi","development-tools::build-utils","science"]
-keywords = ["machine-learning","api-bindings","llama","llama-cpp","inference"]
+categories = [
+    "api-bindings",
+    "development-tools::ffi",
+    "development-tools::build-utils",
+    "science",
+]
+keywords = [
+    "machine-learning",
+    "api-bindings",
+    "llama",
+    "llama-cpp",
+    "inference",
+]
+exclude = ["examples", "example_test"]
 license-file = "LICENSE"
 readme = "README.md"
 repository = "https://github.com/mdrokz/rust-llama.cpp"
-version = "0.1.2"
+version = "0.2.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -21,4 +33,10 @@ bindgen = "0.66.1"
 
 [lib]
 name = "llama_cpp_rs"
-path = "src/lib.rs"
\ No newline at end of file
+path = "src/lib.rs"
+
+[features]
+opencl = []
+cuda = []
+openblas = []
+blis = []
\ No newline at end of file
diff --git a/README.md b/README.md
index dcf615d..3fec9b6 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ fn main() {
     )
     .unwrap();
 
-    let mut predict_options = PredictOptions {
+    let predict_options = PredictOptions {
         token_callback: Some(Box::new(|token| {
             println!("token1: {}", token);
 
@@ -61,12 +61,20 @@ fn main() {
 ```
 
+## Examples
+
+The examples contain Dockerfiles to run them.
+
+See [examples](https://github.com/mdrokz/rust-llama.cpp/examples/README.md).
+
 ## TODO
 
-- [ ] Implement support for cublas,openBLAS & OpenCL
+- [x] Implement support for cublas, openBLAS & OpenCL
 - [ ] Implement support for GPU (Metal)
 - [ ] Add some test cases
-- [ ] Add some proper examples
+- [ ] Support for fetching models through HTTP & S3
+- [ ] Sync with latest master & support GGUF
+- [x] Add some proper examples
 
 ## LICENSE
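The README change above drops the needless `mut` and streams tokens through `token_callback`, which returns a `bool`. A minimal sketch of using that return value to cap generation, assuming `false` tells the bindings to stop (the diff itself does not spell this out):

```rust
use std::sync::{Arc, Mutex};

use llama_cpp_rs::options::PredictOptions;

// Sketch only: stream tokens into a shared buffer and stop once a byte budget is hit.
// Assumes the callback's bool acts as a "keep going" flag (false = stop).
fn capped_options(budget: usize) -> (PredictOptions, Arc<Mutex<String>>) {
    let collected = Arc::new(Mutex::new(String::new()));
    let sink = collected.clone();

    let options = PredictOptions {
        token_callback: Some(Box::new(move |token| {
            let mut buf = sink.lock().unwrap();
            buf.push_str(&token);
            buf.len() < budget // request more tokens only while under the budget
        })),
        ..Default::default()
    };

    (options, collected)
}
```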
println!("cargo:rustc-link-lib=framework=OpenCL"); + println!("cargo:rustc-link-lib=clblast"); + } + + cxx.file("./llama.cpp/ggml-opencl.cpp"); +} + +fn compile_openblas(cx: &mut Build) { + cx.flag("-DGGML_USE_OPENBLAS") + .include("/usr/local/include/openblas") + .include("/usr/local/include/openblas"); + println!("cargo:rustc-link-lib=openblas"); +} + +fn compile_blis(cx: &mut Build) { + cx.flag("-DGGML_USE_OPENBLAS") + .include("/usr/local/include/blis") + .include("/usr/local/include/blis"); + println!("cargo:rustc-link-search=native=/usr/local/lib"); + println!("cargo:rustc-link-lib=blis"); +} + +fn compile_cuda(cxx_flags: &str) { + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + println!("cargo:rustc-link-search=native=/opt/cuda/lib64"); + + if let Ok(cuda_path) = std::env::var("CUDA_PATH") { + println!( + "cargo:rustc-link-search=native={}/targets/x86_64-linux/lib", + cuda_path + ); + } + + let libs = "cublas culibos cudart cublasLt pthread dl rt"; + + for lib in libs.split_whitespace() { + println!("cargo:rustc-link-lib={}", lib); + } + + let mut nvcc = cc::Build::new(); + + let env_flags = vec![ + ("LLAMA_CUDA_DMMV_X=32", "-DGGML_CUDA_DMMV_X"), + ("LLAMA_CUDA_DMMV_Y=1", "-DGGML_CUDA_DMMV_Y"), + ("LLAMA_CUDA_KQUANTS_ITER=2", "-DK_QUANTS_PER_ITERATION"), + ]; + + let nvcc_flags = "--forward-unknown-to-host-compiler -arch=native "; + + for nvcc_flag in nvcc_flags.split_whitespace() { + nvcc.flag(nvcc_flag); + } + + for cxx_flag in cxx_flags.split_whitespace() { + nvcc.flag(cxx_flag); + } + + for env_flag in env_flags { + let mut flag_split = env_flag.0.split("="); + if let Ok(val) = std::env::var(flag_split.next().unwrap()) { + nvcc.flag(&format!("{}={}", env_flag.1, val)); + } else { + nvcc.flag(&format!("{}={}", env_flag.1, flag_split.next().unwrap())); + } + } + + nvcc.compiler("nvcc") + .file("./llama.cpp/ggml-cuda.cu") + .flag("-Wno-pedantic") + .include("./llama.cpp/ggml-cuda.h") + .compile("ggml-cuda"); +} + +fn compile_ggml(cx: &mut Build, cx_flags: &str) { + for cx_flag in cx_flags.split_whitespace() { + cx.flag(cx_flag); + } + + cx.include("./llama.cpp") + .file("./llama.cpp/ggml.c") + .cpp(false) + .compile("ggml"); +} + +fn compile_llama(cxx: &mut Build, cxx_flags: &str, out_path: &PathBuf, ggml_type: &str) { + for cxx_flag in cxx_flags.split_whitespace() { + cxx.flag(cxx_flag); + } + + let ggml_obj = PathBuf::from(&out_path).join("llama.cpp/ggml.o"); + + cxx.object(ggml_obj); + + if !ggml_type.is_empty() { + let ggml_feature_obj = + PathBuf::from(&out_path).join(format!("llama.cpp/ggml-{}.o", ggml_type)); + cxx.object(ggml_feature_obj); + } + + cxx.shared_flag(true) + .file("./llama.cpp/examples/common.cpp") + .file("./llama.cpp/llama.cpp") + .file("./binding.cpp") + .cpp(true) + .compile("binding"); +} + +fn main() { + let out_path = PathBuf::from(env::var("OUT_DIR").expect("No out dir found")); + + compile_bindings(&out_path); let mut cx_flags = String::from("-Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -march=native -mtune=native"); let mut cxx_flags = String::from("-Wall -Wdeprecated-declarations -Wunused-but-set-variable -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -march=native -mtune=native"); - + // check if os is linux // if so, add -fPIC to cxx_flags if cfg!(target_os = "linux") { cx_flags.push_str(" -pthread"); cxx_flags.push_str(" -fPIC -pthread"); } - - let mut cbuild = &mut cc::Build::new(); - - let mut ccbuild = &mut cc::Build::new(); - - for cx_flag in 
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..6b670af
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,64 @@
+> [!IMPORTANT]
+> This was tested with Docker running on Linux & Windows, on my Linux PC with an RX 6700 XT GPU and my brother's Windows PC with an NVIDIA RTX 2060 SUPER.
+> If you can test it on other GPUs & platforms, please update this `README.md` with a PR!
+
+## Supported / Tested
+
+- AMD RX 6700 XT / Fedora 37
+- NVIDIA RTX 2060 Super / Windows 11 Docker
+
+# Examples
+
+There are three examples: basic, cuda and opencl. Each has its own Dockerfile except the basic example.
+
+# basic
+
+A simple example that runs inference with the default options:
+
+```
+cargo run --release
+```
+
+# cuda
+
+An example that uses NVIDIA GPUs via the cuda feature:
+
+First, build the image from the root of the repository:
+
+```
+docker build -f examples/cuda/Dockerfile . -t llama_cuda
+```
+
+Then you can run it:
+
+### Linux
+
+```
+docker run --device=/dev/dri:/dev/dri --volume=
+:/models llama_cuda
+```
+
+### Windows
+
+```
+docker run --volume=:/models --gpus all llama_cuda
+```
+
+
+# opencl
+
+An example that runs on CLBlast-supported GPUs:
+
+First, build the image from the root of the repository:
+
+```
+docker build -f examples/opencl/Dockerfile . -t llama_opencl
+```
+
+Then you can run it:
+
+### Linux
+
+```
+docker run --device=/dev/dri:/dev/dri --volume=
+:/models llama_opencl
+```
\ No newline at end of file
diff --git a/examples/basic/Cargo.toml b/examples/basic/Cargo.toml
new file mode 100644
index 0000000..cb00112
--- /dev/null
+++ b/examples/basic/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+authors = ["mdrokz "]
+name = "llama_basic"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+llama_cpp_rs = {path = "../../"}
\ No newline at end of file
diff --git a/examples/basic/src/main.rs b/examples/basic/src/main.rs
new file mode 100644
index 0000000..1ec4ff3
--- /dev/null
+++ b/examples/basic/src/main.rs
@@ -0,0 +1,34 @@
+use llama_cpp_rs::{
+    options::{ModelOptions, PredictOptions},
+    LLama,
+};
+
+fn main() {
+    let model_options = ModelOptions::default();
+
+    let llama = LLama::new(
+        "./.bin".into(),
+        &model_options,
+    )
+    .unwrap();
+
+    let predict_options = PredictOptions {
+        tokens: 0,
+        threads: 14,
+        top_k: 90,
+        top_p: 0.86,
+        token_callback: Some(Box::new(|token| {
+            println!("token1: {}", token);
+
+            true
+        })),
+        ..Default::default()
+    };
+
+    llama
+        .predict(
+            "what are the national animals of india".into(),
+            predict_options,
+        )
+        .unwrap();
+}
\ No newline at end of file
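The basic example hard-codes `threads: 14`, which only suits machines with at least that many cores. A small sketch of sizing it from the host instead, assuming `threads` is the plain integer field the literal above suggests:

```rust
use std::thread;

use llama_cpp_rs::options::PredictOptions;

// Sketch: use the host's reported parallelism, falling back to 4 threads.
fn sized_predict_options() -> PredictOptions {
    let threads = thread::available_parallelism()
        .map(|n| n.get() as i32)
        .unwrap_or(4);

    PredictOptions {
        threads,
        ..Default::default()
    }
}
```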
diff --git a/examples/cuda/Cargo.toml b/examples/cuda/Cargo.toml
new file mode 100644
index 0000000..331aa2e
--- /dev/null
+++ b/examples/cuda/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+authors = ["mdrokz "]
+name = "llama_cuda"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+llama_cpp_rs = {path = "../../", features = ["cuda"]}
\ No newline at end of file
diff --git a/examples/cuda/Dockerfile b/examples/cuda/Dockerfile
new file mode 100644
index 0000000..d18c588
--- /dev/null
+++ b/examples/cuda/Dockerfile
@@ -0,0 +1,24 @@
+FROM nvidia/cuda:12.2.0-devel-ubuntu20.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y \
+    git \
+    cmake \
+    clang \
+    cargo \
+    nano \
+    software-properties-common
+
+
+RUN apt-get clean && \
+rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY . .
+
+RUN cd examples/cuda && \
+    cargo build --release
+
+CMD ["./examples/cuda/target/release/llama_cuda"]
\ No newline at end of file
diff --git a/examples/cuda/src/main.rs b/examples/cuda/src/main.rs
new file mode 100644
index 0000000..fbe508a
--- /dev/null
+++ b/examples/cuda/src/main.rs
@@ -0,0 +1,33 @@
+use llama_cpp_rs::{
+    options::{ModelOptions, PredictOptions},
+    LLama,
+};
+
+fn main() {
+    let model_options = ModelOptions {
+        n_gpu_layers: 12,
+        ..Default::default()
+    };
+
+    let llama = LLama::new("/models/.bin".into(), &model_options).unwrap();
+
+    let predict_options = PredictOptions {
+        tokens: 0,
+        threads: 14,
+        top_k: 90,
+        top_p: 0.86,
+        token_callback: Some(Box::new(|token| {
+            println!("token1: {}", token);
+
+            true
+        })),
+        ..Default::default()
+    };
+
+    llama
+        .predict(
+            "what are the national animals of india".into(),
+            predict_options,
+        )
+        .unwrap();
+}
diff --git a/examples/opencl/Cargo.toml b/examples/opencl/Cargo.toml
new file mode 100644
index 0000000..49e5ef0
--- /dev/null
+++ b/examples/opencl/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+authors = ["mdrokz "]
+name = "llama_opencl"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+llama_cpp_rs = {path = "../../", features = ["opencl"]}
\ No newline at end of file
diff --git a/examples/opencl/Dockerfile b/examples/opencl/Dockerfile
new file mode 100644
index 0000000..0b51118
--- /dev/null
+++ b/examples/opencl/Dockerfile
@@ -0,0 +1,29 @@
+FROM cebxan/amdgpu-opencl
+
+# install common dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    cmake \
+    clang \
+    cargo \
+    nano \
+    clinfo \
+    software-properties-common \
+    opencl-headers \
+    ocl-icd-libopencl1 \
+    ocl-icd-opencl-dev
+
+
+# install clblast
+RUN add-apt-repository ppa:cnugteren/clblast && \
+    apt-get update && \
+    apt-get install -y libclblast-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN cd examples/opencl && \
+    cargo build --release
+
+CMD ["./examples/opencl/target/release/llama_opencl"]
\ No newline at end of file
diff --git a/examples/opencl/src/main.rs b/examples/opencl/src/main.rs
new file mode 100644
index 0000000..fbe508a
--- /dev/null
+++ b/examples/opencl/src/main.rs
@@ -0,0 +1,33 @@
+use llama_cpp_rs::{
+    options::{ModelOptions, PredictOptions},
+    LLama,
+};
+
+fn main() {
+    let model_options = ModelOptions {
+        n_gpu_layers: 12,
+        ..Default::default()
+    };
+
+    let llama = LLama::new("/models/.bin".into(), &model_options).unwrap();
+
+    let predict_options = PredictOptions {
+        tokens: 0,
+        threads: 14,
+        top_k: 90,
+        top_p: 0.86,
+        token_callback: Some(Box::new(|token| {
+            println!("token1: {}", token);
+
+            true
+        })),
+        ..Default::default()
+    };
+
+    llama
+        .predict(
+            "what are the national animals of india".into(),
+            predict_options,
+        )
+        .unwrap();
+}
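Both GPU examples pin `n_gpu_layers: 12`, which will not be the right offload depth for every card or model. A sketch of making it configurable at run time; `N_GPU_LAYERS` is a made-up environment variable, and the field is assumed to be the same integer type the examples use:

```rust
use std::env;

use llama_cpp_rs::options::ModelOptions;

// Sketch: read the offload depth from a hypothetical N_GPU_LAYERS variable,
// falling back to the 12 layers the examples hard-code.
fn model_options_from_env() -> ModelOptions {
    let n_gpu_layers = env::var("N_GPU_LAYERS")
        .ok()
        .and_then(|v| v.parse().ok())
        .unwrap_or(12);

    ModelOptions {
        n_gpu_layers,
        ..Default::default()
    }
}
```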