Implement blas support #7

Merged: 16 commits, Aug 29, 2023

2 changes: 2 additions & 0 deletions .dockerignore
@@ -0,0 +1,2 @@
*.bin
example_test/
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,2 +1,3 @@
/target
*.bin
*.bin
/example_test
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

26 changes: 22 additions & 4 deletions Cargo.toml
@@ -2,12 +2,24 @@
authors = ["mdrokz <[email protected]>"]
name = "llama_cpp_rs"
description = "Rust bindings for LLAMA.CPP inference"
categories = ["api-bindings","development-tools::ffi","development-tools::build-utils","science"]
keywords = ["machine-learning","api-bindings","llama","llama-cpp","inference"]
categories = [
"api-bindings",
"development-tools::ffi",
"development-tools::build-utils",
"science",
]
keywords = [
"machine-learning",
"api-bindings",
"llama",
"llama-cpp",
"inference",
]
exclude = ["examples", "example_test"]
license-file = "LICENSE"
readme = "README.md"
repository = "https://github.com/mdrokz/rust-llama.cpp"
version = "0.1.2"
version = "0.2.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -21,4 +33,10 @@ bindgen = "0.66.1"

[lib]
name = "llama_cpp_rs"
path = "src/lib.rs"
path = "src/lib.rs"

[features]
opencl = []
cuda = []
openblas = []
blis = []
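
As an illustrative aside (not part of this diff), a downstream build would opt into one of these optional backends through Cargo feature flags; the commands below are a sketch, and the feature names simply mirror the `[features]` table above:

```
# illustrative only: enable one of the optional acceleration backends
cargo build --features cuda       # NVIDIA cuBLAS
cargo build --features opencl     # CLBlast / OpenCL
cargo build --features openblas   # OpenBLAS
cargo build --features blis       # BLIS
```
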
14 changes: 11 additions & 3 deletions README.md
@@ -42,7 +42,7 @@ fn main() {
)
.unwrap();

let mut predict_options = PredictOptions {
let predict_options = PredictOptions {
token_callback: Some(Box::new(|token| {
println!("token1: {}", token);

@@ -61,12 +61,20 @@ fn main() {

```

## Examples

The examples include Dockerfiles for running them.

See [examples](https://github.com/mdrokz/rust-llama.cpp/examples/README.md).

## TODO

- [ ] Implement support for cublas,openBLAS & OpenCL
- [x] Implement support for cublas, openBLAS & OpenCL
- [ ] Implement support for GPU (Metal)
- [ ] Add some test cases
- [ ] Add some proper examples
- [ ] Support for fetching models through http & S3
- [ ] Sync with latest master & support GGUF
- [x] Add some proper examples

## LICENSE

206 changes: 168 additions & 38 deletions build.rs
@@ -1,60 +1,190 @@
use std::env;
use std::path::PathBuf;

fn main() {
use cc::Build;

fn compile_bindings(out_path: &PathBuf) {
let bindings = bindgen::Builder::default()
.header("./binding.h")
.blocklist_function("tokenCallback")
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
.generate()
.expect("Unable to generate bindings");

let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());

bindings
.write_to_file(out_path.join("bindings.rs"))
.write_to_file(&out_path.join("bindings.rs"))
.expect("Couldn't write bindings!");
}

fn compile_opencl(cx: &mut Build, cxx: &mut Build) {
cx.flag("-DGGML_USE_CLBLAST");
cxx.flag("-DGGML_USE_CLBLAST");

if cfg!(target_os = "linux") {
println!("cargo:rustc-link-lib=OpenCL");
println!("cargo:rustc-link-lib=clblast");
} else if cfg!(target_os = "macos") {
println!("cargo:rustc-link-lib=framework=OpenCL");
println!("cargo:rustc-link-lib=clblast");
}

cxx.file("./llama.cpp/ggml-opencl.cpp");
}

fn compile_openblas(cx: &mut Build) {
cx.flag("-DGGML_USE_OPENBLAS")
.include("/usr/local/include/openblas")
.include("/usr/local/include/openblas");
println!("cargo:rustc-link-lib=openblas");
}

fn compile_blis(cx: &mut Build) {
cx.flag("-DGGML_USE_OPENBLAS")
.include("/usr/local/include/blis")
.include("/usr/local/include/blis");
println!("cargo:rustc-link-search=native=/usr/local/lib");
println!("cargo:rustc-link-lib=blis");
}

fn compile_cuda(cxx_flags: &str) {
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
println!("cargo:rustc-link-search=native=/opt/cuda/lib64");

if let Ok(cuda_path) = std::env::var("CUDA_PATH") {
println!(
"cargo:rustc-link-search=native={}/targets/x86_64-linux/lib",
cuda_path
);
}

let libs = "cublas culibos cudart cublasLt pthread dl rt";

for lib in libs.split_whitespace() {
println!("cargo:rustc-link-lib={}", lib);
}

let mut nvcc = cc::Build::new();

let env_flags = vec![
("LLAMA_CUDA_DMMV_X=32", "-DGGML_CUDA_DMMV_X"),
("LLAMA_CUDA_DMMV_Y=1", "-DGGML_CUDA_DMMV_Y"),
("LLAMA_CUDA_KQUANTS_ITER=2", "-DK_QUANTS_PER_ITERATION"),
];

let nvcc_flags = "--forward-unknown-to-host-compiler -arch=native ";

for nvcc_flag in nvcc_flags.split_whitespace() {
nvcc.flag(nvcc_flag);
}

for cxx_flag in cxx_flags.split_whitespace() {
nvcc.flag(cxx_flag);
}

for env_flag in env_flags {
let mut flag_split = env_flag.0.split("=");
if let Ok(val) = std::env::var(flag_split.next().unwrap()) {
nvcc.flag(&format!("{}={}", env_flag.1, val));
} else {
nvcc.flag(&format!("{}={}", env_flag.1, flag_split.next().unwrap()));
}
}

nvcc.compiler("nvcc")
.file("./llama.cpp/ggml-cuda.cu")
.flag("-Wno-pedantic")
.include("./llama.cpp/ggml-cuda.h")
.compile("ggml-cuda");
}

fn compile_ggml(cx: &mut Build, cx_flags: &str) {
for cx_flag in cx_flags.split_whitespace() {
cx.flag(cx_flag);
}

cx.include("./llama.cpp")
.file("./llama.cpp/ggml.c")
.cpp(false)
.compile("ggml");
}

fn compile_llama(cxx: &mut Build, cxx_flags: &str, out_path: &PathBuf, ggml_type: &str) {
for cxx_flag in cxx_flags.split_whitespace() {
cxx.flag(cxx_flag);
}

let ggml_obj = PathBuf::from(&out_path).join("llama.cpp/ggml.o");

cxx.object(ggml_obj);

if !ggml_type.is_empty() {
let ggml_feature_obj =
PathBuf::from(&out_path).join(format!("llama.cpp/ggml-{}.o", ggml_type));
cxx.object(ggml_feature_obj);
}

cxx.shared_flag(true)
.file("./llama.cpp/examples/common.cpp")
.file("./llama.cpp/llama.cpp")
.file("./binding.cpp")
.cpp(true)
.compile("binding");
}

fn main() {
let out_path = PathBuf::from(env::var("OUT_DIR").expect("No out dir found"));

compile_bindings(&out_path);

let mut cx_flags = String::from("-Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -march=native -mtune=native");
let mut cxx_flags = String::from("-Wall -Wdeprecated-declarations -Wunused-but-set-variable -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -march=native -mtune=native");

// check if os is linux
// if so, add -fPIC to cxx_flags
if cfg!(target_os = "linux") {
cx_flags.push_str(" -pthread");
cxx_flags.push_str(" -fPIC -pthread");
}

let mut cbuild = &mut cc::Build::new();

let mut ccbuild = &mut cc::Build::new();

for cx_flag in cx_flags.split(" ").into_iter() {
cbuild = cbuild.flag(cx_flag);
}

for cxx_flag in cxx_flags.split(" ").into_iter() {
ccbuild = ccbuild.flag(cxx_flag);
}


cbuild
.include("./llama.cpp")
.file("./llama.cpp/ggml.c")
.cpp(false)
.compile("ggml");

let out_dir = env::var("OUT_DIR").unwrap();
let ggml_obj = PathBuf::from(out_dir).join("llama.cpp/ggml.o");

ccbuild
.include("./llama.cpp/examples")
.include("./llama.cpp")
.shared_flag(true)
.object(ggml_obj)
.file("./llama.cpp/examples/common.cpp")
.file("./llama.cpp/llama.cpp")
.file("./binding.cpp")
.cpp(true)
.compile("binding");

let mut cx = cc::Build::new();

let mut cxx = cc::Build::new();

let mut ggml_type = String::new();

cxx.include("./llama.cpp/examples").include("./llama.cpp");

if cfg!(feature = "opencl") {
compile_opencl(&mut cx, &mut cxx);
ggml_type = "opencl".to_string();
} else if cfg!(feature = "openblas") {
compile_openblas(&mut cx);
} else if cfg!(feature = "blis") {
compile_blis(&mut cx);
}

if cfg!(feature = "cuda") {
cx_flags.push_str(" -DGGML_USE_CUBLAS");
cxx_flags.push_str(" -DGGML_USE_CUBLAS");

cx.include("/usr/local/cuda/include")
.include("/opt/cuda/include");
cxx.include("/usr/local/cuda/include")
.include("/opt/cuda/include");

if let Ok(cuda_path) = std::env::var("CUDA_PATH") {
cx.include(format!("{}/targets/x86_64-linux/include", cuda_path));
cxx.include(format!("{}/targets/x86_64-linux/include", cuda_path));
}

compile_ggml(&mut cx, &cx_flags);

compile_cuda(&cxx_flags);

compile_llama(&mut cxx, &cxx_flags, &out_path, "cuda");
} else {
compile_ggml(&mut cx, &cx_flags);

compile_llama(&mut cxx, &cxx_flags, &out_path, &ggml_type);
}
}
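
A note on the CUDA path above: `compile_cuda` only falls back to the defaults `LLAMA_CUDA_DMMV_X=32`, `LLAMA_CUDA_DMMV_Y=1`, and `LLAMA_CUDA_KQUANTS_ITER=2` when the corresponding environment variables are unset, so the nvcc defines can be tuned from the shell. A hedged sketch (the override values are illustrative, not recommendations):

```
# illustrative: build.rs forwards these variables to nvcc as
# -DGGML_CUDA_DMMV_X / -DGGML_CUDA_DMMV_Y / -DK_QUANTS_PER_ITERATION
LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_KQUANTS_ITER=1 cargo build --features cuda
```
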
64 changes: 64 additions & 0 deletions examples/README.md
@@ -0,0 +1,64 @@
> [!IMPORTANT]
> This was tested with Docker running on Linux and Windows: on my Linux PC with an RX 6700 XT GPU and on my brother's Windows PC with an NVIDIA RTX 2060 SUPER. <br>If you can test it on other GPUs & platforms, please update this `README.md` with a PR!<br>

## Supported / Tested

- AMD RX 6700 XT / Fedora 37
- NVIDIA RTX 2060 Super / Windows 11 Docker

# Examples

There are 3 examples: basic, cuda, and opencl. Each has its own Dockerfile, except the basic example.

# basic

A simple example that runs inference on the default options:

```
cargo run --release
```
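
For orientation, this example follows the usage pattern shown in the crate README above. The sketch below is illustrative rather than the example's exact source: the model path and prompt are placeholders, and the import paths, the `Default` implementations, and the `token_callback` returning `true` to continue generation are assumptions to check against the example code.

```
use llama_cpp_rs::{
    options::{ModelOptions, PredictOptions},
    LLama,
};

fn main() {
    // placeholder path: point this at a local GGML model file
    let model_options = ModelOptions::default();
    let llama = LLama::new("./models/model.bin".into(), &model_options).unwrap();

    let predict_options = PredictOptions {
        // stream tokens as they arrive; returning true keeps generation going (assumed contract)
        token_callback: Some(Box::new(|token| {
            print!("{}", token);
            true
        })),
        ..Default::default()
    };

    llama
        .predict("What are the national parks of India?".into(), predict_options)
        .unwrap();
}
```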

# cuda

An example that uses NVIDIA GPUs with the cuda feature:

First, build the image from the root of the repository:

```
docker build -f examples/cuda/Dockerfile . -t llama_cuda
```

Then you can run it:

### linux

```
docker run --device=/dev/dri:/dev/dri --volume=<your directory that contains the models>:/models llama_cuda
```

### windows

```
docker run --volume=<your directory that contains the models>:/models --gpus all llama_cuda
```


# opencl

An example that runs on CLBlast-supported GPUs:

First, build the image from the root of the repository:

```
docker build -f examples/opencl/Dockerfile . -t llama_opencl
```

Then you can run it:

### linux

```
docker run --device=/dev/dri:/dev/dri --volume=<your directory that contains the models>:/models llama_opencl
```