Showing 2 changed files with 151 additions and 0 deletions.
python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llamacpp-convert/CMakeLists.txt (45 additions, 0 deletions)
# set(TARGET convert-gguf-to-npu)
# add_executable(${TARGET} convert-gguf-to-npu.cpp)
# install(TARGETS ${TARGET} RUNTIME)
# target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_17)

cmake_minimum_required(VERSION 3.10)

project(LLM_GGUF_TO_NPU VERSION 1.0.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Directory containing the prebuilt llama.cpp headers and import libraries;
# adjust this path to your local setup.
set(LIBRARY_DIR "D:\\yina\\llamacpplibs")
include_directories(${LIBRARY_DIR}/include)

add_library(llama STATIC IMPORTED)
set_target_properties(llama PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/llama.lib)

add_library(common STATIC IMPORTED)
set_target_properties(common PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/common.lib)

add_library(ggml STATIC IMPORTED)
set_target_properties(ggml PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/ggml.lib)

set(TARGET convert-gguf-to-npu)
add_executable(${TARGET} convert-gguf-to-npu.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

# Copy the runtime DLLs next to the executable after each build.
add_custom_command(TARGET convert-gguf-to-npu POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${LIBRARY_DIR}/llama.dll
        ${LIBRARY_DIR}/ggml.dll
        ${CMAKE_BINARY_DIR}/Release/
    COMMENT "Copying llama.dll and ggml.dll to build/Release"
)

# add_custom_command(TARGET llama-cli-npu POST_BUILD
#     COMMAND ${CMAKE_COMMAND} -E copy_directory
#         ${DLL_DIR}/
#         ${CMAKE_BINARY_DIR}/Release/
#     COMMENT "Copying dependency to build/Release"
# )
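Note that LIBRARY_DIR above is hard-coded to a local directory. A minimal sketch of an alternative, not part of this commit, is to expose it as a CMake cache variable so the path can be overridden at configure time; the default value and the include/ existence check below are illustrative assumptions only.

    # Sketch only: allow the llama.cpp library location to be set with
    #   cmake -DLIBRARY_DIR=<path-to-llamacpp-libs> ..
    set(LIBRARY_DIR "D:\\yina\\llamacpplibs" CACHE PATH "Directory with llama.cpp headers and import libraries")
    if(NOT EXISTS "${LIBRARY_DIR}/include")
        message(FATAL_ERROR "LIBRARY_DIR (${LIBRARY_DIR}) does not contain an include/ directory")
    endif()
    include_directories(${LIBRARY_DIR}/include)

With a cache variable, the path can be supplied on the cmake command line instead of being edited in the file.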
python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llamacpp-convert/convert-gguf-to-npu.cpp (106 additions, 0 deletions)
#include "arg.h" | ||
#include "common.h" | ||
#include "log.h" | ||
#include "llama.h" | ||
#include <filesystem> | ||
#include <vector> | ||
#include<iostream> | ||
|
||
#ifdef _WIN32 | ||
#define PATH_SEP '\\' | ||
#else | ||
#define PATH_SEP '/' | ||
#endif | ||
|
||
static void print_usage(int, char ** argv) { | ||
LOG("\nexample usage:\n"); | ||
LOG("\n %s -m model.gguf -o output_dir --low-bit sym_int4 --quantization-group-size 0\n", argv[0]); | ||
LOG("\n"); | ||
} | ||
|
||
int main(int argc, char ** argv) { | ||
gpt_params params; | ||
|
||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_NPU, print_usage)) { | ||
return 1; | ||
} | ||
|
||
gpt_init(); | ||
|
||
// init LLM | ||
|
||
llama_backend_init(); | ||
llama_numa_init(params.numa); | ||
|
||
enum gguf_npu_qtype type; | ||
|
||
if (params.low_bit == "sym_int4") { | ||
type = GGUF_TYPE_NPU_CW_Q4_0; | ||
} else if (params.low_bit == "asym_int4") { | ||
type = GGUF_TYPE_NPU_CW_Q4_1; | ||
} else { | ||
std::cerr << "\033[31m" << __func__ << ": error: Only support sym_int4 and asym_int4 but got " << params.low_bit << "\033[0m\n" << std::endl; | ||
exit(1); | ||
} | ||
|
||
if (params.npu_outfile == "NPU_MODEL") { | ||
fprintf(stderr , "\033[31m%s: error: Please provide npu model output dir with -o <output_dir>\033[0m\n" , __func__); | ||
exit(1); | ||
} | ||
|
||
// initialize the model | ||
|
||
llama_model_params model_params = llama_model_params_from_gpt_params(params); | ||
|
||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); | ||
|
||
if (model == NULL) { | ||
fprintf(stderr , "%s: error: unable to load model\n" , __func__); | ||
return 1; | ||
} | ||
|
||
// initialize the context | ||
|
||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params); | ||
|
||
llama_context * ctx = llama_new_context_with_model(model, ctx_params); | ||
|
||
if (ctx == NULL) { | ||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); | ||
return 1; | ||
} | ||
|
||
std::string output_dir = params.npu_outfile; | ||
std::filesystem::path dirPath = output_dir; | ||
|
||
// handle weight first | ||
if(std::filesystem::create_directory(dirPath)) { | ||
std::cout << "Directory created: " << dirPath << std::endl; | ||
} else { | ||
std::cout << "Failed to create directory or already exists: " << dirPath << "\n"; | ||
} | ||
|
||
std::string weight_path = output_dir + PATH_SEP + "model_weights"; // TODO: optimize / | ||
dirPath = weight_path; | ||
if(std::filesystem::create_directory(dirPath)) { | ||
std::cout << "Directory created: " << dirPath << std::endl; | ||
} else { | ||
std::cout << "Failed to create directory or already exists: " << dirPath << "\n"; | ||
} | ||
|
||
if (params.quantization_group_size != 0) { | ||
std::cerr << "\033[31mOnly support quantization group_size=0, fall back to channel wise quantization.\033[0m\n" << std::endl; | ||
} | ||
|
||
std::cout << "\033[32mConverting GGUF model to " << params.low_bit << " NPU model...\033[0m" << std::endl; | ||
convert_gguf_to_npu_weight(model, weight_path.c_str(), type); | ||
|
||
std::cout << "\033[32mModel weights saved to " << weight_path << "\033[0m"<< std::endl; | ||
|
||
llama_free(ctx); | ||
llama_free_model(model); | ||
|
||
llama_backend_free(); | ||
|
||
return 0; | ||
} |
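For reference, matching the print_usage string above and the target name defined in CMakeLists.txt, a typical invocation of the built tool would look like the following (the model path and output directory are placeholders):

    convert-gguf-to-npu -m model.gguf -o output_dir --low-bit sym_int4 --quantization-group-size 0

As implemented above, the converted weights are written to the model_weights subdirectory under the given output directory.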