diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
index 98f2c161070..7298d570ac3 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
@@ -51,9 +51,12 @@ python baichuan2.py
 
 Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
+- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
-- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
+- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 
 ### Sample Output
 #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
index 04e4a0ff8b6..3217b1a4681 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
@@ -15,6 +15,7 @@
 #
 
 
+import os
 import torch
 import time
 import argparse
@@ -48,28 +49,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
         help="The huggingface repo id for the Baichuan2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder; leave it blank if you do not want to save. \
+            If the path does not exist, the lowbit model will be saved there; \
+            otherwise, the lowbit model will be loaded.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 optimize_model=True,
-                                                 pipeline=True,
-                                                 max_context_len=args.max_context_len,
-                                                 max_prompt_len=args.max_prompt_len,
-                                                 torch_dtype=torch.float16,
-                                                 attn_implementation="eager",
-                                                 transpose_value_cache=not args.disable_transpose_value_cache,
-                                                 trust_remote_code=True)
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                     optimize_model=True,
+                                                     pipeline=True,
+                                                     max_context_len=args.max_context_len,
+                                                     max_prompt_len=args.max_prompt_len,
+                                                     torch_dtype=torch.float16,
+                                                     attn_implementation="eager",
+                                                     transpose_value_cache=not args.disable_transpose_value_cache,
+                                                     trust_remote_code=True)
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            max_context_len=args.max_context_len,
+            max_prompt_len=args.max_prompt_len,
+            pipeline=True,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+            trust_remote_code=True
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     DEFAULT_SYSTEM_PROMPT = """\
 """
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
index 2d43c8ca5fc..6f36051f36b 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
@@ -15,6 +15,7 @@
 #
 
 
+import os
 import torch
 import time
 import argparse
@@ -48,29 +49,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
         help="The huggingface repo id for the Llama2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder; leave it blank if you do not want to save. \
+            If the path does not exist, the lowbit model will be saved there; \
+            otherwise, the lowbit model will be loaded.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 optimize_model=True,
-                                                 pipeline=True,
-                                                 max_context_len=args.max_context_len,
-                                                 max_prompt_len=args.max_prompt_len,
-                                                 quantization_group_size=args.quantization_group_size,
-                                                 torch_dtype=torch.float16,
-                                                 attn_implementation="eager",
-                                                 transpose_value_cache=not args.disable_transpose_value_cache)
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                     optimize_model=True,
+                                                     pipeline=True,
+                                                     max_context_len=args.max_context_len,
+                                                     max_prompt_len=args.max_prompt_len,
+                                                     quantization_group_size=args.quantization_group_size,
+                                                     torch_dtype=torch.float16,
+                                                     attn_implementation="eager",
+                                                     transpose_value_cache=not args.disable_transpose_value_cache)
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            max_context_len=args.max_context_len,
+            max_prompt_len=args.max_prompt_len,
+            pipeline=True,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     DEFAULT_SYSTEM_PROMPT = """\
 """
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
index 377cc17c339..eed246d439d 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
@@ -15,6 +15,7 @@
 #
 
 
+import os
 import torch
 import time
 import argparse
@@ -54,29 +55,49 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
         help="The huggingface repo id for the Llama3 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
    )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder; leave it blank if you do not want to save. \
+            If the path does not exist, the lowbit model will be saved there; \
+            otherwise, the lowbit model will be loaded.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--quantization_group_size", type=int, default=0)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 torch_dtype=torch.float16,
-                                                 optimize_model=True,
-                                                 pipeline=True,
-                                                 max_context_len=args.max_context_len,
-                                                 max_prompt_len=args.max_prompt_len,
-                                                 quantization_group_size=args.quantization_group_size,
-                                                 attn_implementation="eager",
-                                                 transpose_value_cache=not args.disable_transpose_value_cache)
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                     torch_dtype=torch.float16,
+                                                     optimize_model=True,
+                                                     pipeline=True,
+                                                     max_context_len=args.max_context_len,
+                                                     max_prompt_len=args.max_prompt_len,
+                                                     quantization_group_size=args.quantization_group_size,
+                                                     attn_implementation="eager",
+                                                     transpose_value_cache=not args.disable_transpose_value_cache)
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            max_context_len=args.max_context_len,
+            max_prompt_len=args.max_prompt_len,
+            pipeline=True,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     print("-" * 80)
     print("done")
     with torch.inference_mode():
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 27a405b0719..ceae4e9a0b4 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -127,7 +127,7 @@ Arguments info:
 - `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
-- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 3e793f4c1e2..07faed4f1fd 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -166,6 +166,8 @@ def from_pretrained(cls, *args, **kwargs):
 
         logger.info(f"Converting model, it may takes up to several minutes ...")
 
+        model.config.update({"optimize_model": optimize_model})
+
         if mock_device == "cpu":
             with torch.no_grad():
                 # Only mock quantization_group_size=0 for now
@@ -262,7 +264,6 @@ def optimize_npu_model(cls, *args, **kwargs):
                 transpose_value_cache=transpose_value_cache,
                 group_size=quantization_group_size
             )
-            model.save_low_bit = types.MethodType(save_low_bit, model)
         else:
             from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                 import convert_llm
@@ -271,7 +272,7 @@
                         max_prompt_len=max_prompt_len,
                         transpose_value_cache=transpose_value_cache,
                         group_size=quantization_group_size)
-
+        model.save_low_bit = types.MethodType(save_low_bit, model)
         return model
 
     @classmethod
@@ -304,8 +305,10 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
         ignore_argument(kwargs, "pipeline_parallel_stages")
         ignore_argument(kwargs, "mixed_precision")
         ignore_argument(kwargs, "quantization_group_size")
-        optimize_model = kwargs.pop("optimize_model", False)
-        max_output_len = kwargs.pop("max_output_len", 1024)
+        ignore_argument(kwargs, "optimize_model")
+        pipeline = kwargs.pop("pipeline", False)
+        max_context_len = kwargs.pop("max_context_len", 1024)
+        max_context_len = max_context_len - 1
         max_prompt_len = kwargs.pop("max_prompt_len", 512)
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)
@@ -355,6 +358,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
         bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
         mixed_precision = config_dict.pop("mixed_precision", False)
         quantization_group_size = config_dict.pop("group_size", 0)
+        optimize_model = config_dict.pop("optimize_model", False)
 
         invalidInputError(
             qtype,
@@ -450,13 +454,12 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
         quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
         logger.info(f"Converting model, it may takes up to several minutes ...")
         from intel_npu_acceleration_library.compiler import create_npu_kernels
-
         if optimize_model:
             invalidInputError(
-                max_prompt_len < max_output_len,
+                max_prompt_len < max_context_len,
                 (
                     f"max_prompt_len ({max_prompt_len}) should be less"
-                    " than max_output_len ({max_output_len})"
+                    " than max_context_len ({max_context_len})"
                 ),
             )
             from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre
@@ -468,7 +471,8 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
 
             with torch.no_grad():
                 optimize_llm_pre(model, qtype, mixed_precision,
-                                 quantization_group_size=quantization_group_size)
+                                 quantization_group_size=quantization_group_size,
+                                 load=bigdl_lcmu_enabled)
                 cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
                                  quantization_group_size, *model_args, **kwargs)
                 create_npu_kernels(llm)
@@ -541,17 +545,25 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
         for param in model.parameters():
             param.requires_grad_(False)
 
-        if optimize_model:
+        if optimize_model and not pipeline:
             from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
             optimize_llm(
                 llm,
-                max_output_len=max_output_len,
+                max_output_len=max_context_len,
                 max_prompt_len=max_prompt_len,
                 inter_pp=inter_pp,
                 intra_pp=intra_pp,
                 transpose_value_cache=transpose_value_cache,
                 group_size=quantization_group_size
             )
+        elif optimize_model and pipeline:
+            from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
+                import convert_llm
+            convert_llm(llm,
+                        kv_len=max_context_len,
+                        max_prompt_len=max_prompt_len,
+                        transpose_value_cache=transpose_value_cache,
+                        group_size=quantization_group_size)
 
         return model
 
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/common.py b/python/llm/src/ipex_llm/transformers/npu_models/common.py
index 92d48b0c564..0ab4f5ae444 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/common.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/common.py
@@ -43,7 +43,7 @@ def reshape_lm_head_input(x):
     return x
 
 
-def split_linear(module, module_name, n_splits=2):
+def split_linear(module, module_name, n_splits=2, load=False):
     in_features = module.in_features
     invalidInputError(in_features % n_splits == 0,
                       f"in_features of the linear layer {module_name} must be divisible by"
@@ -51,17 +51,27 @@ def split_linear(module, module_name, n_splits=2, load=False):
     weight_split = torch.tensor_split(module.weight, n_splits, dim=1)
     linear_list = torch.nn.ModuleList()
     bias = module.bias
-    for idx, weight in enumerate(weight_split):
-        new_linear = torch.nn.Linear(weight.size(1),
-                                     weight.size(0),
-                                     bias=False if bias is None else True)
-        new_linear.bias = bias
-        new_linear.weight = torch.nn.Parameter(weight.contiguous(), requires_grad=False)
-        linear_list.add_module(f"{module_name}_dq_{idx}", new_linear)
+    from transformers.utils.generic import ContextManagers
+    init_contexts = []
+    if load:
+        from transformers.modeling_utils import no_init_weights
+        from accelerate.big_modeling import init_empty_weights
+        init_contexts.append(no_init_weights(_enable=load))
+        init_contexts.append(init_empty_weights())
+
+    with ContextManagers(init_contexts):
+        for idx, weight in enumerate(weight_split):
+            new_linear = torch.nn.Linear(weight.size(1),
+                                         weight.size(0),
+                                         bias=False if bias is None else True)
+            new_linear.bias = bias
+            new_linear.weight = torch.nn.Parameter(weight.contiguous(), requires_grad=False)
+            linear_list.add_module(f"{module_name}_dq_{idx}", new_linear)
     return linear_list
 
 
-def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down_proj=2):
+def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down_proj=2,
+                  load=False):
     from transformers.models.qwen2.modeling_qwen2 import Qwen2MLP, Qwen2Attention
     from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention
     attn_module_names = ["q_proj", "k_proj", "v_proj", "o_proj"]
@@ -69,7 +79,8 @@ def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down
     if isinstance(module, (Qwen2Attention, LlamaAttention)):
         for name in attn_module_names:
             setattr(module, f"{name}_dq_list", split_linear(getattr(module, name), name,
-                                                            n_splits=n_splits_hidden_size))
+                                                            n_splits=n_splits_hidden_size,
+                                                            load=load))
             delattr(module, name)
     elif isinstance(module, (Qwen2MLP, LlamaMLP)):
         for name in mlp_module_names:
@@ -77,5 +88,6 @@ def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down
             if name == 'down_proj':
                 n_splits_mlp = n_splits_down_proj
             setattr(module, f"{name}_dq_list", split_linear(getattr(module, name), name,
-                                                            n_splits=n_splits_mlp))
+                                                            n_splits=n_splits_mlp,
+                                                            load=load))
             delattr(module, name)
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index 39999ce77f3..2c8487fb27f 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -31,7 +31,7 @@ def convert_forward(m, target_m, new_forward):
 
 
 def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
-                     quantization_group_size=0):
+                     quantization_group_size=0, load=False):
     if model.config.model_type == "baichuan":
         # process NormHead module in Baichuan2 7B
         if hasattr(model, 'lm_head') and model.lm_head is not None:
@@ -104,9 +104,9 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         )
         n_splits_linear = model.config.hidden_size // quantization_group_size
         n_splits_down_proj = model.config.intermediate_size // quantization_group_size
-
         model.apply(lambda m: split_linears(m, n_splits_hidden_size=n_splits_linear,
-                                            n_splits_down_proj=n_splits_down_proj))
+                                            n_splits_down_proj=n_splits_down_proj,
+                                            load=load))
 
     if quantization_group_size != 0:
         split_num = model.config.hidden_size // quantization_group_size
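
For reference, below is a minimal sketch of the save/load flow that the three pipeline example scripts in this diff implement. It assumes the NPU `AutoModelForCausalLM` is imported from `ipex_llm.transformers.npu_model` (the import lines are outside the hunks shown above), and the model id and `./npu-lowbit` folder are placeholders, not values taken from the patch.

```python
# Sketch only: the first run converts the HF checkpoint and caches the low-bit model;
# later runs take the load_low_bit() branch and skip conversion. Paths are placeholders.
import os
import torch
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import path
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"   # or a local checkpoint folder
lowbit_path = "./npu-lowbit"                   # hypothetical low-bit cache folder

if not lowbit_path or not os.path.exists(lowbit_path):
    # No cached low-bit model yet: convert from the original checkpoint.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 transpose_value_cache=True,
                                                 trust_remote_code=True)
else:
    # Cached low-bit model found: load it directly.
    model = AutoModelForCausalLM.load_low_bit(lowbit_path,
                                              pipeline=True,
                                              max_context_len=1024,
                                              max_prompt_len=512,
                                              torch_dtype=torch.float16,
                                              attn_implementation="eager",
                                              transpose_value_cache=True,
                                              trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Persist the converted low-bit model so the next run can load it directly.
if lowbit_path and not os.path.exists(lowbit_path):
    model.save_low_bit(lowbit_path)
```

Note that, per the `npu_model.py` changes above, `optimize_model` is written into the saved config by `from_pretrained` and read back by `load_low_bit`, so only `pipeline` and the length limits need to be passed again at load time.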