Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distributed inference with NCCL #1017

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion mistralrs-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,11 @@ float8.workspace = true
llguidance = { git = "https://github.com/microsoft/llguidance", rev = "cfef3df97372a7b84d74976ff41cc9cb78bca6cc", default-features = false, features = ["lark"] }
toktrie_hf_tokenizers = { git = "https://github.com/microsoft/llguidance", rev = "cfef3df97372a7b84d74976ff41cc9cb78bca6cc" }
objc = { version = "0.2.7", optional = true }
nix = { version = "0.29", features = ["process", "signal"], optional = true }

[features]
pyo3_macros = ["pyo3"]
cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda", "mistralrs-quant/cuda", "dep:mistralrs-paged-attn", "mistralrs-paged-attn/cuda", "float8/cuda"]
cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda", "mistralrs-quant/cuda", "dep:mistralrs-paged-attn", "mistralrs-paged-attn/cuda", "float8/cuda", "dep:nix"]
cudnn = ["candle-core/cudnn"]
metal = ["candle-core/metal", "candle-nn/metal", "dep:objc", "dep:mistralrs-paged-attn", "mistralrs-paged-attn/metal"]
flash-attn = ["cuda", "dep:candle-flash-attn"]
Expand Down
25 changes: 25 additions & 0 deletions mistralrs-core/src/pipeline/normal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,31 @@ impl Loader for NormalLoader {
AttentionImplementation::Eager
};

#[cfg(feature = "cuda")]
{
    // Experimental process fan-out: fork `num_parallel` children, let everything
    // run for five seconds, then have the original parent terminate the children.
    use nix::sys::signal::{kill, Signal};
    use nix::unistd::{fork, ForkResult};

    let num_parallel = 2;
    // PIDs of the children created by *this* process; only the original parent
    // should end up with a non-empty list.
    let mut pids = Vec::new();
    for _ in 0..num_parallel {
        // SAFETY: NOTE(review) — `fork` in a multithreaded process leaves the child
        // restricted to async-signal-safe calls; confirm no other threads are
        // running at this point before relying on this.
        match unsafe { fork() } {
            Ok(ForkResult::Parent { child, .. }) => {
                println!("created child {child}");
                pids.push(child);
            }
            Ok(ForkResult::Child) => {
                println!("child executing!!");
                // The child inherits a copy of `pids` listing its earlier siblings.
                // Drop it so the child never sends SIGTERM to them below, and stop
                // looping so the child does not fork grandchildren.
                pids.clear();
                break;
            }
            Err(_) => println!("Fork failed"),
        }
    }
    std::thread::sleep(std::time::Duration::from_secs_f32(5.));
    // Empty in every child (see above), so only the parent signals.
    for pid in pids {
        kill(pid, Signal::SIGTERM)?;
    }
}

let mut model = match self.kind {
ModelKind::Normal => normal_model_loader!(
paths,
Expand Down
3 changes: 2 additions & 1 deletion mistralrs-quant/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ float8.workspace = true
once_cell.workspace = true
metal = { workspace = true, optional = true }
thiserror = "1"
cudarc = { version = "0.12.1", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false, optional = true}

[features]
cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda"]
cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda", "dep:cudarc", "cudarc/nccl"]
metal = ["candle-core/metal", "candle-nn/metal", "dep:metal"]

[build-dependencies]
Expand Down
102 changes: 102 additions & 0 deletions mistralrs-quant/src/distributed/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
use std::{
rc::Rc,
sync::{LazyLock, RwLock},
};

use candle_core::{
backend::BackendStorage,
cuda::{cudarc::driver::DeviceSlice, WrapErr},

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

failed to resolve: could not find `cudarc` in `cuda`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

unresolved import `candle_core::cuda::WrapErr`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

failed to resolve: could not find `cudarc` in `cuda`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

unresolved import `candle_core::cuda::WrapErr`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

failed to resolve: could not find `cudarc` in `cuda`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

unresolved import `candle_core::cuda::WrapErr`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

failed to resolve: could not find `cudarc` in `cuda`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

unresolved import `candle_core::cuda::WrapErr`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

failed to resolve: could not find `cudarc` in `cuda`

Check failure on line 8 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

unresolved import `candle_core::cuda::WrapErr`
DType, Device, Tensor,
};
use candle_core::{CpuStorage, CustomOp1, Layout, Result, Shape};
use cudarc::nccl::ReduceOp;

Check failure on line 12 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

failed to resolve: use of undeclared crate or module `cudarc`

Check failure on line 12 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

failed to resolve: use of undeclared crate or module `cudarc`

Check failure on line 12 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

failed to resolve: use of undeclared crate or module `cudarc`

Check failure on line 12 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

failed to resolve: use of undeclared crate or module `cudarc`

Check failure on line 12 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

failed to resolve: use of undeclared crate or module `cudarc`
use cudarc::nccl::{Comm, Id};

Check failure on line 13 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

failed to resolve: use of undeclared crate or module `cudarc`

Check failure on line 13 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

failed to resolve: use of undeclared crate or module `cudarc`

Check failure on line 13 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

failed to resolve: use of undeclared crate or module `cudarc`

Check failure on line 13 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

failed to resolve: use of undeclared crate or module `cudarc`

Check failure on line 13 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

failed to resolve: use of undeclared crate or module `cudarc`
use half::{bf16, f16};

// Per-thread slot holding the NCCL communicator. It starts out as `None` and is
// filled in by `init_comm`; `all_reduce` reads it. Because this is declared with
// `thread_local!`, every thread has its own independent slot.
thread_local! {
static COMM: LazyLock<RwLock<Option<Rc<Comm>>>> = LazyLock::new(|| RwLock::new(None));
}

/// Custom op that performs an NCCL all-reduce on a tensor's CUDA storage.
struct AllReduce {
// Communicator on which the collective call is issued.
comm: Rc<Comm>,
// Reduction operator forwarded to `Comm::all_reduce`.
op: ReduceOp,
}

// SAFETY: NOTE(review) — `Rc<Comm>` is neither `Send` nor `Sync`, so these impls are
// manual assertions that an `AllReduce` is never actually shared with or moved to
// another thread. Presumably they exist only to satisfy the bounds of `CustomOp1`;
// confirm that the op cannot escape the thread that created it.
unsafe impl Sync for AllReduce {}
unsafe impl Send for AllReduce {}

impl CustomOp1 for AllReduce {
// Returns the fixed, human-readable name of this op.
fn name(&self) -> &'static str {
"all-reduce"
}

// CPU forward pass: always fails, since this op only has a CUDA implementation.
// Both arguments are ignored.
fn cpu_fwd(&self, _s: &CpuStorage, _l: &Layout) -> Result<(CpuStorage, Shape)> {
candle_core::bail!("all-reduce is not supported on cpu")
}

fn cuda_fwd(
&self,
s: &candle_core::CudaStorage,
l: &Layout,
) -> Result<(candle_core::CudaStorage, Shape)> {
let elem_count = l.shape().elem_count();
let dev = s.device().clone();
let dst = match s.dtype() {
DType::F32 => {
let s = s.as_cuda_slice::<f32>()?;

Check failure on line 46 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

no method named `as_cuda_slice` found for reference `&candle_core::CudaStorage` in the current scope

Check failure on line 46 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

no method named `as_cuda_slice` found for reference `&CudaStorage` in the current scope

Check failure on line 46 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

no method named `as_cuda_slice` found for reference `&CudaStorage` in the current scope

Check failure on line 46 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

no method named `as_cuda_slice` found for reference `&CudaStorage` in the current scope

Check failure on line 46 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

no method named `as_cuda_slice` found for reference `&CudaStorage` in the current scope
let s = match l.contiguous_offsets() {
Some((0, l)) if l == s.len() => s,
Some(_) | None => candle_core::bail!("input has to be contiguous"),
};
let mut dst = unsafe { dev.alloc::<f32>(elem_count) }.w()?;

Check failure on line 51 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

no method named `alloc` found for struct `candle_core::CudaDevice` in the current scope

Check failure on line 51 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

no method named `alloc` found for struct `CudaDevice` in the current scope

Check failure on line 51 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

no method named `alloc` found for struct `CudaDevice` in the current scope

Check failure on line 51 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

no method named `alloc` found for struct `CudaDevice` in the current scope

Check failure on line 51 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

no method named `alloc` found for struct `CudaDevice` in the current scope
self.comm
.all_reduce(s, &mut dst, &self.op)
.map_err(candle_core::Error::debug)?;
candle_core::CudaStorage::wrap_cuda_slice(dst, dev)

Check failure on line 55 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

no function or associated item named `wrap_cuda_slice` found for struct `candle_core::CudaStorage` in the current scope

Check failure on line 55 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

no function or associated item named `wrap_cuda_slice` found for struct `CudaStorage` in the current scope

Check failure on line 55 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

no function or associated item named `wrap_cuda_slice` found for struct `CudaStorage` in the current scope

Check failure on line 55 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

no function or associated item named `wrap_cuda_slice` found for struct `CudaStorage` in the current scope

Check failure on line 55 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

no function or associated item named `wrap_cuda_slice` found for struct `CudaStorage` in the current scope
}
DType::BF16 => {
let s = s.as_cuda_slice::<bf16>()?;

Check failure on line 58 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

no method named `as_cuda_slice` found for reference `&candle_core::CudaStorage` in the current scope

Check failure on line 58 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

no method named `as_cuda_slice` found for reference `&CudaStorage` in the current scope

Check failure on line 58 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

no method named `as_cuda_slice` found for reference `&CudaStorage` in the current scope

Check failure on line 58 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

no method named `as_cuda_slice` found for reference `&CudaStorage` in the current scope

Check failure on line 58 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

no method named `as_cuda_slice` found for reference `&CudaStorage` in the current scope
let s = match l.contiguous_offsets() {
Some((0, l)) if l == s.len() => s,
Some(_) | None => candle_core::bail!("input has to be contiguous"),
};
let mut dst = unsafe { dev.alloc::<bf16>(elem_count) }.w()?;

Check failure on line 63 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

no method named `alloc` found for struct `candle_core::CudaDevice` in the current scope

Check failure on line 63 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

no method named `alloc` found for struct `CudaDevice` in the current scope

Check failure on line 63 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

no method named `alloc` found for struct `CudaDevice` in the current scope

Check failure on line 63 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

no method named `alloc` found for struct `CudaDevice` in the current scope

Check failure on line 63 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

no method named `alloc` found for struct `CudaDevice` in the current scope
self.comm
.all_reduce(s, &mut dst, &self.op)
.map_err(candle_core::Error::debug)?;
candle_core::CudaStorage::wrap_cuda_slice(dst, dev)

Check failure on line 67 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Clippy

no function or associated item named `wrap_cuda_slice` found for struct `candle_core::CudaStorage` in the current scope

Check failure on line 67 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Check (ubuntu-latest, stable)

no function or associated item named `wrap_cuda_slice` found for struct `CudaStorage` in the current scope

Check failure on line 67 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (macOS-latest, stable)

no function or associated item named `wrap_cuda_slice` found for struct `CudaStorage` in the current scope

Check failure on line 67 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Test Suite (ubuntu-latest, stable)

no function or associated item named `wrap_cuda_slice` found for struct `CudaStorage` in the current scope

Check failure on line 67 in mistralrs-quant/src/distributed/mod.rs

View workflow job for this annotation

GitHub Actions / Docs

no function or associated item named `wrap_cuda_slice` found for struct `CudaStorage` in the current scope
}
DType::F16 => {
let s = s.as_cuda_slice::<f16>()?;
let s = match l.contiguous_offsets() {
Some((0, l)) if l == s.len() => s,
Some(_) | None => candle_core::bail!("input has to be contiguous"),
};
let mut dst = unsafe { dev.alloc::<f16>(elem_count) }.w()?;
self.comm
.all_reduce(s, &mut dst, &self.op)
.map_err(candle_core::Error::debug)?;
candle_core::CudaStorage::wrap_cuda_slice(dst, dev)
}
dtype => candle_core::bail!("unsupported dtype {dtype:?}"),
};
Ok((dst, l.shape().clone()))
}
}

pub fn init_comm(id: Id, rank: usize, world_size: usize, dev: &Device) -> Result<()> {
let dev = dev.as_cuda_device()?;
let comm = Comm::from_rank(dev.cuda_device(), rank, world_size, id)
.map_err(candle_core::Error::debug)?;
COMM.with(|x| {
*x.write().unwrap() = Some(Rc::new(comm));
});
Ok(())
}

/// CUDA all-reduce operation:
/// https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/operations.html#allreduce
pub fn all_reduce(x: &Tensor, op: ReduceOp) -> Result<Tensor> {
let comm = COMM.with(|x| x.read().unwrap().as_ref().unwrap().clone());
x.apply_op1_no_bwd(&AllReduce { comm, op })
}
1 change: 1 addition & 0 deletions mistralrs-quant/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ mod metal_kernels;

mod bitsandbytes;
mod cublaslt;
mod distributed;
mod dummy;
mod fp8;
mod gguf;
Expand Down
Loading