diff --git a/wasmedge-ggml-codellama/Cargo.toml b/wasmedge-ggml-codellama/Cargo.toml
new file mode 100644
index 0000000..f96c9d1
--- /dev/null
+++ b/wasmedge-ggml-codellama/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "wasmedge-ggml-codellama"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+chat-prompts = "0.1"
+endpoints = "0.1"
+wasi-nn = { git = "https://github.com/second-state/wasmedge-wasi-nn", branch = "ggml" }
+clap = "4.4.6"
+once_cell = "1.18"
diff --git a/wasmedge-ggml-codellama/README.md b/wasmedge-ggml-codellama/README.md
new file mode 100644
index 0000000..94bca08
--- /dev/null
+++ b/wasmedge-ggml-codellama/README.md
@@ -0,0 +1,174 @@
+# Chat with `CodeLlama-13B-Instruct` using WASI-NN with GGML Backend
+
+## Requirements
+
+### For macOS (Apple Silicon)
+
+Install WasmEdge 0.13.4 with the WASI-NN ggml plugin (Metal enabled on Apple Silicon) via the installer:
+
+```bash
+curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- --plugin wasi_nn-ggml
+# After installing WasmEdge, you have to activate the environment.
+# Assuming you use zsh (the default shell on macOS), you will need to run the following command:
+source $HOME/.zshenv
+```
+
+### For Ubuntu (>= 20.04)
+
+Because OpenBLAS is enabled on Ubuntu, you must install `libopenblas-dev` with `apt update && apt install -y libopenblas-dev`.
+
+Install WasmEdge 0.13.4 with the WASI-NN ggml plugin (OpenBLAS enabled) via the installer:
+
+```bash
+apt update && apt install -y libopenblas-dev # You may need sudo if the user is not root.
+curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- --plugin wasi_nn-ggml
+# After installing WasmEdge, you have to activate the environment.
+# Assuming you use bash (the default shell on Ubuntu), you will need to run the following command:
+source $HOME/.bashrc
+```
+
+### For General Linux
+
+Install WasmEdge 0.13.4 with the WASI-NN ggml plugin via the installer:
+
+```bash
+curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- --plugin wasi_nn-ggml
+# After installing WasmEdge, you have to activate the environment.
+# Assuming you use bash (the default shell on most Linux distributions), you will need to run the following command:
+source $HOME/.bashrc
+```
+
+## Prepare WASM application
+
+### (Recommended) Use the pre-built one bundled in this repo
+
+We provide a pre-built WASM file of this example in this folder; see `wasmedge-ggml-codellama.wasm`.
+
+### (Optional) Build from source
+
+If you want to make modifications, you can build from source.
+
+Compile the application to WebAssembly:
+
+```bash
+cargo build --target wasm32-wasi --release
+```
+
+The output WASM file will be at `target/wasm32-wasi/release/`. Copy it to the current directory:
+
+```bash
+cp target/wasm32-wasi/release/wasmedge-ggml-codellama.wasm ./wasmedge-ggml-codellama.wasm
+```
+
+## Get Model
+
+In this example, we are going to use `codellama-13b-instruct.Q4_0.gguf`.
+
+Download the model:
+
+```bash
+curl -LO https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q4_0.gguf
+```
+
+## Execute
+
+Execute the WASM with `wasmedge`, using the named model feature to preload the large model:
+
+```bash
+wasmedge --dir .:. \
+  --nn-preload default:GGML:CPU:codellama-13b-instruct.Q4_0.gguf \
+  wasmedge-ggml-codellama.wasm --model-alias default
+```
+
+After executing the command, you may need to wait a moment for the input prompt to appear.
+You can enter your question once you see the `[USER]:` prompt:
+
+~~~console
+[USER]:
+convert a String into a std::ffi::CString in Rust
+[ASSISTANT]:
+In Rust, you can convert a `String` into a `std::ffi::CString` using the `to_cstring` method. Here's an example:
+```
+use std::ffi::CString;
+
+let s = "Hello, world!";
+let c_string = s.to_cstring();
+```
+This will create a `CString` from the `String` `s` and store it in the `c_string` variable.
+
+Alternatively, you can use the `CString::new` method to create a `CString` from a `String` directly:
+```
+use std::ffi::CString;
+
+let s = "Hello, world!";
+let c_string = CString::new(s);
+```
+This will create a `CString` from the `String` `s` and store it in the `c_string` variable.
+
+Note that the `to_cstring` method and the `CString::new` method both return a `Result` type, which indicates whether the conversion was successful or not. If the conversion fails, the `Result` will contain an error message.
+[USER]:
+write a hello-world program in Python
+[ASSISTANT]:
+Sure! Here is a simple "Hello, World!" program in Python:
+```
+print("Hello, World!")
+```
+This program will print the string "Hello, World!" to the console.
+
+Alternatively, you can also use the `print()` function with parentheses to print the string:
+```
+print("Hello, World!")
+```
+This will also print the string "Hello, World!" to the console.
+
+I hope this helps! Let me know if you have any questions.
+~~~
+
+## Errors
+
+- After running `apt update && apt install -y libopenblas-dev`, you may encounter the following error:
+
+  ```bash
+  ...
+  E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
+  E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?
+  ```
+
+  This indicates that you are not logged in as `root`. Please try installing again using the `sudo` command:
+
+  ```bash
+  sudo apt update && sudo apt install -y libopenblas-dev
+  ```
+
+- After running the `wasmedge` command, you may receive the following error:
+
+  ```bash
+  [2023-10-02 14:30:31.227] [error] loading failed: invalid path, Code: 0x20
+  [2023-10-02 14:30:31.227] [error] load library failed:libblas.so.3: cannot open shared object file: No such file or directory
+  [2023-10-02 14:30:31.227] [error] loading failed: invalid path, Code: 0x20
+  [2023-10-02 14:30:31.227] [error] load library failed:libblas.so.3: cannot open shared object file: No such file or directory
+  unknown option: nn-preload
+  ```
+
+  This suggests that the plugin installation was not successful. To resolve this issue, please try installing the plugin again.
+
+## Parameters
+
+Currently, we support the following parameters:
+
+- `LLAMA_LOG`: Set it to a non-empty value to enable logging.
+- `LLAMA_N_CTX`: Set the context size, the same as the `--ctx-size` parameter in llama.cpp (default: 512).
+- `LLAMA_N_PREDICT`: Set the number of tokens to predict, the same as the `--n-predict` parameter in llama.cpp (default: 512).
+
+These parameters can be set by adding the following environment variables before the `wasmedge` command:
+
+```bash
+LLAMA_LOG=1 LLAMA_N_CTX=2048 LLAMA_N_PREDICT=512 \
+wasmedge --dir .:. \
+  --nn-preload default:GGML:CPU:codellama-13b-instruct.Q4_0.gguf \
+  wasmedge-ggml-codellama.wasm --model-alias default --ctx-size 2048
+```
+
+## Credit
+
+The WASI-NN ggml plugin embeds [`llama.cpp`](git://github.com/ggerganov/llama.cpp.git@b1217) as its backend.
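+
+## Appendix: Minimal Inference Sketch
+
+For reference, below is a minimal sketch of the WASI-NN calls this example makes (see `src/main.rs` for the full program). It assumes the model has been preloaded under the alias `default` via `--nn-preload` as shown above, and it sends a single hard-coded question instead of running the interactive chat loop; the full example also formats the conversation with the `chat-prompts` crate before feeding it to the model.
+
+```rust
+use wasi_nn::{ExecutionTarget, GraphBuilder, GraphEncoding, TensorType};
+
+fn main() -> Result<(), String> {
+    // Load the model that `--nn-preload` registered under the alias "default".
+    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::CPU)
+        .build_from_cache("default")
+        .map_err(|e| e.to_string())?;
+    let mut ctx = graph.init_execution_context().map_err(|e| e.to_string())?;
+
+    // Feed the prompt as a UTF-8 byte tensor and run inference.
+    // (The full example builds a CodeLlama instruction prompt here.)
+    let prompt = "write a hello-world program in Python";
+    ctx.set_input(0, TensorType::U8, &[1], prompt.as_bytes())
+        .map_err(|e| e.to_string())?;
+    ctx.compute().map_err(|e| e.to_string())?;
+
+    // Read back the generated text (up to the buffer/context size).
+    let mut buf = vec![0u8; 2048];
+    let n = ctx.get_output(0, &mut buf).map_err(|e| e.to_string())?;
+    println!("{}", String::from_utf8_lossy(&buf[..n]));
+    Ok(())
+}
+```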
diff --git a/wasmedge-ggml-codellama/src/main.rs b/wasmedge-ggml-codellama/src/main.rs
new file mode 100644
index 0000000..613266e
--- /dev/null
+++ b/wasmedge-ggml-codellama/src/main.rs
@@ -0,0 +1,155 @@
+use chat_prompts::chat::{llama::CodeLlamaInstructPrompt, BuildChatPrompt, ChatPrompt};
+use clap::{Arg, Command};
+use endpoints::chat::{ChatCompletionRequest, ChatCompletionRequestMessage, ChatCompletionRole};
+use once_cell::sync::OnceCell;
+
+const DEFAULT_CTX_SIZE: &str = "2048";
+static CTX_SIZE: OnceCell<usize> = OnceCell::new();
+
+#[allow(unreachable_code)]
+fn main() -> Result<(), String> {
+    let matches = Command::new("Llama API Server")
+        .arg(
+            Arg::new("model_alias")
+                .short('m')
+                .long("model-alias")
+                .value_name("ALIAS")
+                .help("Sets the model alias")
+                .required(true),
+        )
+        .arg(
+            Arg::new("ctx_size")
+                .short('c')
+                .long("ctx-size")
+                .value_parser(clap::value_parser!(u32))
+                .value_name("CTX_SIZE")
+                .help("Sets the prompt context size")
+                .default_value(DEFAULT_CTX_SIZE),
+        )
+        .get_matches();
+
+    // model alias
+    let model_name = matches
+        .get_one::<String>("model_alias")
+        .unwrap()
+        .to_string();
+    println!("[INFO] Model alias: {alias}", alias = &model_name);
+
+    // prompt context size
+    let ctx_size = matches.get_one::<u32>("ctx_size").unwrap();
+    if CTX_SIZE.set(*ctx_size as usize).is_err() {
+        return Err(String::from("Fail to parse prompt context size"));
+    }
+    println!("[INFO] Prompt context size: {size}", size = ctx_size);
+
+    let template = ChatPrompt::CodeLlamaInstructPrompt(CodeLlamaInstructPrompt::default());
+
+    let mut chat_request = ChatCompletionRequest::default();
+
+    // load the model into wasi-nn
+    let graph = match wasi_nn::GraphBuilder::new(
+        wasi_nn::GraphEncoding::Ggml,
+        wasi_nn::ExecutionTarget::CPU,
+    )
+    .build_from_cache(model_name.as_ref())
+    {
+        Ok(graph) => graph,
+        Err(e) => {
+            return Err(format!(
+                "Fail to load model into wasi-nn: {msg}",
+                msg = e.to_string()
+            ))
+        }
+    };
+
+    // initialize the execution context
+    let mut context = match graph.init_execution_context() {
+        Ok(context) => context,
+        Err(e) => {
+            return Err(format!(
+                "Fail to create wasi-nn execution context: {msg}",
+                msg = e.to_string()
+            ))
+        }
+    };
+
+    print_separator();
+
+    loop {
+        println!("[USER]:");
+        let user_message = read_input();
+        chat_request
+            .messages
+            .push(ChatCompletionRequestMessage::new(
+                ChatCompletionRole::User,
+                user_message,
+            ));
+
+        // build prompt
+        let prompt = match template.build(&mut chat_request.messages) {
+            Ok(prompt) => prompt,
+            Err(e) => {
+                return Err(format!(
+                    "Fail to build chat prompts: {msg}",
+                    msg = e.to_string()
+                ))
+            }
+        };
+
+        // set the input tensor (the prompt as UTF-8 bytes)
+        let tensor_data = prompt.as_bytes().to_vec();
+        if context
+            .set_input(0, wasi_nn::TensorType::U8, &[1], &tensor_data)
+            .is_err()
+        {
+            return Err(String::from("Fail to set input tensor"));
+        };
+
+        // execute the inference
+        if context.compute().is_err() {
+            return Err(String::from("Fail to execute model inference"));
+        }
+
+        // retrieve the output
+        let mut output_buffer = vec![0u8; *CTX_SIZE.get().unwrap()];
+        let mut output_size = match context.get_output(0, &mut output_buffer) {
+            Ok(size) => size,
+            Err(e) => {
+                return Err(format!(
+                    "Fail to get output tensor: {msg}",
+                    msg = e.to_string()
+                ))
+            }
+        };
+        output_size = std::cmp::min(*CTX_SIZE.get().unwrap(), output_size);
+        let output = String::from_utf8_lossy(&output_buffer[..output_size]).to_string();
+        println!("[ASSISTANT]:\n{}", output.trim());
+
+        // put the answer into the `messages` of chat_request
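+        // Keeping the assistant's reply in the message history means the next
+        // turn's prompt is rebuilt from the whole conversation, so the model
+        // retains context across turns (at the cost of a growing prompt).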
+        chat_request
+            .messages
+            .push(ChatCompletionRequestMessage::new(
+                ChatCompletionRole::Assistant,
+                output,
+            ));
+    }
+
+    Ok(())
+}
+
+fn read_input() -> String {
+    loop {
+        let mut answer = String::new();
+        std::io::stdin()
+            .read_line(&mut answer)
+            .expect("Failed to read line");
+        if !answer.is_empty() && answer != "\n" && answer != "\r\n" {
+            return answer;
+        }
+    }
+}
+
+fn print_separator() {
+    println!("---------------------------------------");
+}
diff --git a/wasmedge-ggml-codellama/wasmedge-ggml-codellama.wasm b/wasmedge-ggml-codellama/wasmedge-ggml-codellama.wasm
new file mode 100755
index 0000000..81c4bc9
Binary files /dev/null and b/wasmedge-ggml-codellama/wasmedge-ggml-codellama.wasm differ