From f71b38a99467fb12a5b99449cc75b438613ce600 Mon Sep 17 00:00:00 2001 From: Yuwen Hu <54161268+Oscilloscope98@users.noreply.github.com> Date: Thu, 26 Sep 2024 17:40:22 +0800 Subject: [PATCH] Update MiniCPM_V_26 GPU example with save & load (#12127) --- .../Multimodal/MiniCPM-V-2_6/README.md | 9 +++ .../Multimodal/MiniCPM-V-2_6/chat.py | 56 ++++++++++++++----- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/README.md b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/README.md index 6063a286b4a..569225f6503 100644 --- a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/README.md +++ b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/README.md @@ -114,12 +114,21 @@ set SYCL_CACHE_PERSISTENT=1 ``` python ./chat.py --prompt 'What is in the image?' --stream ``` +- save model with low-bit optimization (if `LOWBIT_MODEL_PATH` does not exist): + ``` + python ./chat.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --lowbit-path LOWBIT_MODEL_PATH --prompt 'What is in the image?' + ``` +- chat with saved model with low-bit optimization (if `LOWBIT_MODEL_PATH` exists): + ``` + python ./chat.py --lowbit-path LOWBIT_MODEL_PATH --prompt 'What is in the image?' + ``` > [!TIP] > For chatting in streaming mode, it is recommended to set the environment variable `PYTHONUNBUFFERED=1`. Arguments info: - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM-V-2_6 (e.g. `openbmb/MiniCPM-V-2_6`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'openbmb/MiniCPM-V-2_6'`. +- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load the model with IPEX-LLM low-bit optimization. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. 
If it is an existing path, the saved model with low-bit optimization in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the optimized low-bit model will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string. - `--image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image to be infered. It is default to be `'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'`. - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is in the image?'`. - `--stream`: flag to chat in streaming mode diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/chat.py b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/chat.py index a698cd9d457..cad68239fd5 100644 --- a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/chat.py +++ b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/chat.py @@ -22,7 +22,7 @@ import torch from PIL import Image from ipex_llm.transformers import AutoModel -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoProcessor if __name__ == '__main__': @@ -30,6 +30,13 @@ parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-V-2_6", help='The huggingface repo id for the openbmb/MiniCPM-V-2_6 model to be downloaded' ', or the path to the huggingface checkpoint folder') + parser.add_argument("--lowbit-path", type=str, + default="", + help="The path to the saved model folder with IPEX-LLM low-bit optimization. " + "Leave it blank if you want to load from the original model. " + "If the path does not exist, model with low-bit optimization will be saved there. " 
+ "Otherwise, model with low-bit optimization will be loaded from the path.", + ) parser.add_argument('--image-url-or-path', type=str, default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg', help='The URL or path to the image to infer') @@ -41,22 +48,43 @@ args = parser.parse_args() model_path = args.repo_id_or_model_path image_path = args.image_url_or_path + + lowbit_path = args.lowbit_path + + if not lowbit_path or not os.path.exists(lowbit_path): + # Load model in 4 bit, + # which convert the relevant layers in the model into INT4 format + # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. + # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. + model = AutoModel.from_pretrained(model_path, + load_in_low_bit="sym_int4", + optimize_model=True, + trust_remote_code=True, + use_cache=True, + modules_to_not_convert=["vpm", "resampler"]) + + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + else: + model = AutoModel.load_low_bit(lowbit_path, + optimize_model=True, + trust_remote_code=True, + use_cache=True, + modules_to_not_convert=["vpm", "resampler"]) + tokenizer = AutoTokenizer.from_pretrained(lowbit_path, + trust_remote_code=True) - # Load model in 4 bit, - # which convert the relevant layers in the model into INT4 format - # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. - # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
- model = AutoModel.from_pretrained(model_path, - load_in_low_bit="sym_int4", - optimize_model=True, - trust_remote_code=True, - use_cache=True, - modules_to_not_convert=["vpm", "resampler"]) - model = model.half().to('xpu') - tokenizer = AutoTokenizer.from_pretrained(model_path, - trust_remote_code=True) model.eval() + if lowbit_path and not os.path.exists(lowbit_path): + processor = AutoProcessor.from_pretrained(model_path, + trust_remote_code=True) + model.save_low_bit(lowbit_path) + tokenizer.save_pretrained(lowbit_path) + processor.save_pretrained(lowbit_path) + + model = model.half().to('xpu') + query = args.prompt if os.path.exists(image_path): image = Image.open(image_path).convert('RGB')