From 9ecc387c7811b53868b23026f1d6badb1ac58c24 Mon Sep 17 00:00:00 2001
From: Christoph
Date: Mon, 22 Jan 2024 22:24:02 +0100
Subject: [PATCH] fixing missing argument and 'blck_size'

---
 orca-core/src/llm/quantized.rs | 8 ++++----
 orca-models/src/mistral.rs     | 7 ++++---
 orca-models/src/quantized.rs   | 5 +++--
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/orca-core/src/llm/quantized.rs b/orca-core/src/llm/quantized.rs
index 3f75a58..6e8c909 100644
--- a/orca-core/src/llm/quantized.rs
+++ b/orca-core/src/llm/quantized.rs
@@ -198,7 +198,7 @@ impl Quantized {
                 let mut total_size_in_bytes = 0;
                 for (_, tensor) in model.tensor_infos.iter() {
                     let elem_count = tensor.shape.elem_count();
-                    total_size_in_bytes += elem_count * tensor.ggml_dtype.type_size() / tensor.ggml_dtype.blck_size();
+                    total_size_in_bytes += elem_count * tensor.ggml_dtype.type_size() / tensor.ggml_dtype.block_size();
                 }
                 log::info!(
                     "loaded {:?} tensors ({}) in {:.2}s",
@@ -206,14 +206,14 @@ impl Quantized {
                     &format_size(total_size_in_bytes),
                     start.elapsed().as_secs_f32(),
                 );
-                Some(ModelWeights::from_gguf(model, &mut file)?)
+                Some(ModelWeights::from_gguf(model, &mut file, &Device::Cpu)?)
             }
             Some("ggml" | "bin") | Some(_) | None => {
-                let model = ggml_file::Content::read(&mut file)?;
+                let model = ggml_file::Content::read(&mut file, &Device::Cpu)?;
                 let mut total_size_in_bytes = 0;
                 for (_, tensor) in model.tensors.iter() {
                     let elem_count = tensor.shape().elem_count();
-                    total_size_in_bytes += elem_count * tensor.dtype().type_size() / tensor.dtype().blck_size();
+                    total_size_in_bytes += elem_count * tensor.dtype().type_size() / tensor.dtype().block_size();
                 }
                 log::info!(
                     "loaded {:?} tensors ({}) in {:.2}s",
diff --git a/orca-models/src/mistral.rs b/orca-models/src/mistral.rs
index a50a82e..14db6a7 100644
--- a/orca-models/src/mistral.rs
+++ b/orca-models/src/mistral.rs
@@ -1,3 +1,4 @@
+use candle::Device;
 use crate::utils::text_generation::{Model, TextGeneration};
 use candle_transformers::models::mistral;
 use candle_transformers::models::quantized_mistral;
@@ -72,7 +73,7 @@ impl Mistral {
         P: AsRef<std::path::Path>,
     {
         let cfg = mistral::Config::config_7b_v0_1(config.flash_attn);
-        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(weights)?;
+        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(weights, &Device::Cpu)?;
         let model = quantized_mistral::Model::new(&cfg, vb)?;
         let tokenizer = tokenizers::Tokenizer::from_file(tokenizer).map_err(|m| anyhow::anyhow!(m))?;
         Ok(Self {
@@ -88,7 +89,7 @@ impl Mistral {
 
     pub fn from_stream(weights: Vec<u8>, tokenizer: Vec<u8>, config: Config) -> anyhow::Result<Self> {
         let cfg = mistral::Config::config_7b_v0_1(config.flash_attn);
-        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf_buffer(&weights)?;
+        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf_buffer(&weights, &Device::Cpu)?;
        let model = quantized_mistral::Model::new(&cfg, vb)?;
        let tokenizer = tokenizers::Tokenizer::from_bytes(tokenizer).map_err(|m| anyhow::anyhow!(m))?;
        Ok(Self {
@@ -112,7 +113,7 @@ impl Mistral {
         ));
         let tokenizer = repo.get("tokenizer.json").await?;
         let model_path = repo.get("model-q4k.gguf").await?;
-        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(model_path)?;
+        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(model_path, &Device::Cpu)?;
         let model = quantized_mistral::Model::new(&mistral::Config::config_7b_v0_1(config.flash_attn), vb)?;
         let tokenizer = tokenizers::Tokenizer::from_file(tokenizer).map_err(anyhow::Error::msg)?;
         Ok(Self {
diff --git a/orca-models/src/quantized.rs b/orca-models/src/quantized.rs
index 08107e7..6ce1552 100644
--- a/orca-models/src/quantized.rs
+++ b/orca-models/src/quantized.rs
@@ -3,6 +3,7 @@
 // #![allow(unused_variables)]
 // #![allow(unused_imports)]
 
+use candle::Device;
 use candle::quantized::{ggml_file, gguf_file};
 use candle_transformers::models::quantized_llama::ModelWeights;
 
@@ -64,7 +65,7 @@ impl Quantized {
     pub fn from_gguf_stream(model: Vec<u8>, tokenizer: Vec<u8>, config: Config) -> anyhow::Result<Self> {
         let mut model_reader = std::io::Cursor::new(model);
         let model_content = gguf_file::Content::read(&mut model_reader)?;
-        let model = ModelWeights::from_gguf(model_content, &mut model_reader)?;
+        let model = ModelWeights::from_gguf(model_content, &mut model_reader, &Device::Cpu)?;
         let tokenizer = tokenizers::Tokenizer::from_bytes(tokenizer).map_err(|m| anyhow::anyhow!(m))?;
         Ok(Self {
             model,
@@ -79,7 +80,7 @@ impl Quantized {
 
     pub fn from_ggml_stream(model: Vec<u8>, tokenizer: Vec<u8>, config: Config) -> anyhow::Result<Self> {
         let mut model_reader = std::io::Cursor::new(model);
-        let model_content = ggml_file::Content::read(&mut model_reader)?;
+        let model_content = ggml_file::Content::read(&mut model_reader, &Device::Cpu)?;
         let model = ModelWeights::from_ggml(model_content, 1)?;
         let tokenizer = tokenizers::Tokenizer::from_bytes(tokenizer).map_err(|m| anyhow::anyhow!(m))?;
         Ok(Self {
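
For context, a minimal sketch of the GGUF loading path after this change, assuming a candle release in which ModelWeights::from_gguf takes a target device; the load_gguf helper and its path argument are illustrative only, not part of the orca codebase:

    use candle::Device;
    use candle::quantized::gguf_file;
    use candle_transformers::models::quantized_llama::ModelWeights;

    // Hypothetical helper: load quantized weights from a GGUF file onto the CPU.
    fn load_gguf(path: &str) -> anyhow::Result<ModelWeights> {
        let mut file = std::fs::File::open(path)?;
        // Parse the GGUF metadata and tensor index; Content::read still takes only the reader.
        let content = gguf_file::Content::read(&mut file)?;
        // The device argument is the one added by this patch; CPU mirrors the calls above.
        Ok(ModelWeights::from_gguf(content, &mut file, &Device::Cpu)?)
    }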