Showing 2 changed files with 151 additions and 0 deletions.
python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llamacpp-convert/CMakeLists.txt (45 additions, 0 deletions)
# set(TARGET convert-gguf-to-npu)
# add_executable(${TARGET} convert-gguf-to-npu.cpp)
# install(TARGETS ${TARGET} RUNTIME)
# target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_17)

cmake_minimum_required(VERSION 3.10)

project(LLM_GGUF_TO_NPU VERSION 1.0.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Directory containing the prebuilt llama.cpp headers and import libraries;
# adjust this path to your local setup.
set(LIBRARY_DIR "D:\\yina\\llamacpplibs")
include_directories(${LIBRARY_DIR}/include)

add_library(llama STATIC IMPORTED)
set_target_properties(llama PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/llama.lib)

add_library(common STATIC IMPORTED)
set_target_properties(common PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/common.lib)

add_library(ggml STATIC IMPORTED)
set_target_properties(ggml PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/ggml.lib)

set(TARGET convert-gguf-to-npu)
add_executable(${TARGET} convert-gguf-to-npu.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

# Copy the runtime DLLs next to the executable after each build.
add_custom_command(TARGET convert-gguf-to-npu POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${LIBRARY_DIR}/llama.dll
        ${LIBRARY_DIR}/ggml.dll
        ${CMAKE_BINARY_DIR}/Release/
    COMMENT "Copying llama.dll and ggml.dll to build/Release"
)

# add_custom_command(TARGET llama-cli-npu POST_BUILD
#     COMMAND ${CMAKE_COMMAND} -E copy_directory
#         ${DLL_DIR}/
#         ${CMAKE_BINARY_DIR}/Release/
#     COMMENT "Copying dependency to build/Release"
# )
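Note that LIBRARY_DIR above is hard-coded to a local directory. A minimal sketch of an alternative, not part of this commit, is to expose it as a CMake cache variable so the path can be overridden at configure time; the default value and the include/ existence check below are illustrative assumptions only.

    # Sketch only: allow the llama.cpp library location to be set with
    #   cmake -DLIBRARY_DIR=<path-to-llamacpp-libs> ..
    set(LIBRARY_DIR "D:\\yina\\llamacpplibs" CACHE PATH "Directory with llama.cpp headers and import libraries")
    if(NOT EXISTS "${LIBRARY_DIR}/include")
        message(FATAL_ERROR "LIBRARY_DIR (${LIBRARY_DIR}) does not contain an include/ directory")
    endif()
    include_directories(${LIBRARY_DIR}/include)

With a cache variable, the path can be supplied on the cmake command line instead of being edited in the file.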
python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llamacpp-convert/convert-gguf-to-npu.cpp (106 additions, 0 deletions)
#include "arg.h" | ||
#include "common.h" | ||
#include "log.h" | ||
#include "llama.h" | ||
#include <filesystem> | ||
#include <vector> | ||
#include<iostream> | ||
|
||
#ifdef _WIN32 | ||
#define PATH_SEP '\\' | ||
#else | ||
#define PATH_SEP '/' | ||
#endif | ||
|
||
static void print_usage(int, char ** argv) { | ||
LOG("\nexample usage:\n"); | ||
LOG("\n %s -m model.gguf -o output_dir --low-bit sym_int4 --quantization-group-size 0\n", argv[0]); | ||
LOG("\n"); | ||
} | ||
|
||
int main(int argc, char ** argv) { | ||
gpt_params params; | ||
|
||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_NPU, print_usage)) { | ||
return 1; | ||
} | ||
|
||
gpt_init(); | ||
|
||
// init LLM | ||
|
||
llama_backend_init(); | ||
llama_numa_init(params.numa); | ||
|
||
enum gguf_npu_qtype type; | ||
|
||
if (params.low_bit == "sym_int4") { | ||
type = GGUF_TYPE_NPU_CW_Q4_0; | ||
} else if (params.low_bit == "asym_int4") { | ||
type = GGUF_TYPE_NPU_CW_Q4_1; | ||
} else { | ||
std::cerr << "\033[31m" << __func__ << ": error: Only support sym_int4 and asym_int4 but got " << params.low_bit << "\033[0m\n" << std::endl; | ||
exit(1); | ||
} | ||
|
||
if (params.npu_outfile == "NPU_MODEL") { | ||
fprintf(stderr , "\033[31m%s: error: Please provide npu model output dir with -o <output_dir>\033[0m\n" , __func__); | ||
exit(1); | ||
} | ||
|
||
// initialize the model | ||
|
||
llama_model_params model_params = llama_model_params_from_gpt_params(params); | ||
|
||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); | ||
|
||
if (model == NULL) { | ||
fprintf(stderr , "%s: error: unable to load model\n" , __func__); | ||
return 1; | ||
} | ||
|
||
// initialize the context | ||
|
||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params); | ||
|
||
llama_context * ctx = llama_new_context_with_model(model, ctx_params); | ||
|
||
if (ctx == NULL) { | ||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); | ||
return 1; | ||
} | ||
|
||
std::string output_dir = params.npu_outfile; | ||
std::filesystem::path dirPath = output_dir; | ||
|
||
// handle weight first | ||
if(std::filesystem::create_directory(dirPath)) { | ||
std::cout << "Directory created: " << dirPath << std::endl; | ||
} else { | ||
std::cout << "Failed to create directory or already exists: " << dirPath << "\n"; | ||
} | ||
|
||
std::string weight_path = output_dir + PATH_SEP + "model_weights"; // TODO: optimize / | ||
dirPath = weight_path; | ||
if(std::filesystem::create_directory(dirPath)) { | ||
std::cout << "Directory created: " << dirPath << std::endl; | ||
} else { | ||
std::cout << "Failed to create directory or already exists: " << dirPath << "\n"; | ||
} | ||
|
||
if (params.quantization_group_size != 0) { | ||
std::cerr << "\033[31mOnly support quantization group_size=0, fall back to channel wise quantization.\033[0m\n" << std::endl; | ||
} | ||
|
||
std::cout << "\033[32mConverting GGUF model to " << params.low_bit << " NPU model...\033[0m" << std::endl; | ||
convert_gguf_to_npu_weight(model, weight_path.c_str(), type); | ||
|
||
std::cout << "\033[32mModel weights saved to " << weight_path << "\033[0m"<< std::endl; | ||
|
||
llama_free(ctx); | ||
llama_free_model(model); | ||
|
||
llama_backend_free(); | ||
|
||
return 0; | ||
} |
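For reference, matching the print_usage string above and the target name defined in CMakeLists.txt, a typical invocation of the built tool would look like the following (the model path and output directory are placeholders):

    convert-gguf-to-npu -m model.gguf -o output_dir --low-bit sym_int4 --quantization-group-size 0

As implemented above, the converted weights are written to the model_weights subdirectory under the given output directory.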