diff --git a/orca-core/src/llm/quantized.rs b/orca-core/src/llm/quantized.rs
index 3f75a58..6e8c909 100644
--- a/orca-core/src/llm/quantized.rs
+++ b/orca-core/src/llm/quantized.rs
@@ -198,7 +198,7 @@ impl Quantized {
                 let mut total_size_in_bytes = 0;
                 for (_, tensor) in model.tensor_infos.iter() {
                     let elem_count = tensor.shape.elem_count();
-                    total_size_in_bytes += elem_count * tensor.ggml_dtype.type_size() / tensor.ggml_dtype.blck_size();
+                    total_size_in_bytes += elem_count * tensor.ggml_dtype.type_size() / tensor.ggml_dtype.block_size();
                 }
                 log::info!(
                     "loaded {:?} tensors ({}) in {:.2}s",
@@ -206,14 +206,14 @@ impl Quantized {
                     &format_size(total_size_in_bytes),
                     start.elapsed().as_secs_f32(),
                 );
-                Some(ModelWeights::from_gguf(model, &mut file)?)
+                Some(ModelWeights::from_gguf(model, &mut file, &Device::Cpu)?)
             }
             Some("ggml" | "bin") | Some(_) | None => {
-                let model = ggml_file::Content::read(&mut file)?;
+                let model = ggml_file::Content::read(&mut file, &Device::Cpu)?;
                 let mut total_size_in_bytes = 0;
                 for (_, tensor) in model.tensors.iter() {
                     let elem_count = tensor.shape().elem_count();
-                    total_size_in_bytes += elem_count * tensor.dtype().type_size() / tensor.dtype().blck_size();
+                    total_size_in_bytes += elem_count * tensor.dtype().type_size() / tensor.dtype().block_size();
                 }
                 log::info!(
                     "loaded {:?} tensors ({}) in {:.2}s",
diff --git a/orca-models/src/mistral.rs b/orca-models/src/mistral.rs
index ab5520b..995e4db 100644
--- a/orca-models/src/mistral.rs
+++ b/orca-models/src/mistral.rs
@@ -1,4 +1,5 @@
 use crate::utils::text_generation::{Model, TextGeneration};
+use candle::Device;
 use candle_transformers::models::mistral;
 use candle_transformers::models::quantized_mistral;
 
@@ -72,7 +73,7 @@ impl Mistral {
         P: AsRef<std::path::Path>,
     {
         let cfg = mistral::Config::config_7b_v0_1(config.flash_attn);
-        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(weights)?;
+        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(weights, &Device::Cpu)?;
         let model = quantized_mistral::Model::new(&cfg, vb)?;
         let tokenizer = tokenizers::Tokenizer::from_file(tokenizer).map_err(|m| anyhow::anyhow!(m))?;
         Ok(Self {
@@ -88,7 +89,7 @@ impl Mistral {
 
     pub fn from_stream(weights: Vec<u8>, tokenizer: Vec<u8>, config: Config) -> anyhow::Result<Self> {
         let cfg = mistral::Config::config_7b_v0_1(config.flash_attn);
-        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf_buffer(&weights)?;
+        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf_buffer(&weights, &Device::Cpu)?;
         let model = quantized_mistral::Model::new(&cfg, vb)?;
         let tokenizer = tokenizers::Tokenizer::from_bytes(tokenizer).map_err(|m| anyhow::anyhow!(m))?;
         Ok(Self {
diff --git a/orca-models/src/quantized.rs b/orca-models/src/quantized.rs
index e6ce59d..f48bb6b 100644
--- a/orca-models/src/quantized.rs
+++ b/orca-models/src/quantized.rs
@@ -65,7 +65,7 @@ impl Quantized {
     pub fn from_gguf_stream(model: Vec<u8>, tokenizer: Vec<u8>, config: Config) -> anyhow::Result<Self> {
         let mut model_reader = std::io::Cursor::new(model);
         let model_content = gguf_file::Content::read(&mut model_reader)?;
-        let model = ModelWeights::from_gguf(model_content, &mut model_reader)?;
+        let model = ModelWeights::from_gguf(model_content, &mut model_reader, &Device::Cpu)?;
         let tokenizer = tokenizers::Tokenizer::from_bytes(tokenizer).map_err(|m| anyhow::anyhow!(m))?;
         Ok(Self {
             model,
@@ -80,7 +80,7 @@ impl Quantized {
 
     pub fn from_ggml_stream(model: Vec<u8>, tokenizer: Vec<u8>, config: Config) -> anyhow::Result<Self> {
         let mut model_reader = std::io::Cursor::new(model);
-        let model_content = ggml_file::Content::read(&mut model_reader)?;
+        let model_content = ggml_file::Content::read(&mut model_reader, &Device::Cpu)?;
        let model = ModelWeights::from_ggml(model_content, 1)?;
         let tokenizer = tokenizers::Tokenizer::from_bytes(tokenizer).map_err(|m| anyhow::anyhow!(m))?;
         Ok(Self {