From 11b82466e42b8a42a9f0a832faa739216f0e45a6 Mon Sep 17 00:00:00 2001
From: leonardozcm
Date: Fri, 5 Jul 2024 11:19:36 +0800
Subject: [PATCH 1/3] clean branch

---
 python/llm/src/ipex_llm/transformers/model.py |  5 ++
 .../src/ipex_llm/transformers/npu_model.py    | 67 +++++--------
 2 files changed, 23 insertions(+), 49 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py
index 8a8105faad9..8b943ca4e33 100644
--- a/python/llm/src/ipex_llm/transformers/model.py
+++ b/python/llm/src/ipex_llm/transformers/model.py
@@ -332,6 +332,11 @@ def from_pretrained(cls,
         else:
             kwargs["pretraining_tp"] = 1
         q_k = load_in_low_bit if load_in_low_bit else "sym_int4"
+
+        invalidInputError(q_k not in ["sym_int4_rtn", "sym_int8_rtn"],
+                          f"The dtype {q_k} is specified for NPU "
+                          "and cannot be used on CPU and GPU")
+
         imatrix_file = kwargs.pop("imatrix", None)
         if q_k in ["gguf_iq2_xxs", "gguf_iq2_xs", "gguf_iq1_s"]:
             invalidInputError(imatrix_file is not None,
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 8d3afed64d9..f0aaf609b7e 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -90,23 +90,12 @@ def from_pretrained(cls,
             warnings.warn("`torch_dtype` will be ignored, `torch.float` will be used")
             kwargs['torch_dtype'] = torch.float

-        low_bit = kwargs.pop('load_in_low_bit', 'fp32')
-        try:
-            # for intel_npu_acceleration_library >= 1.1.0
-            from intel_npu_acceleration_library.dtypes import int8, int4
-            qtype_map = {
-                'sym_int4': "sym_int4_rtn",
-                'sym_int8': "sym_int8_rtn",
-                'fp16': torch.half,
-                'fp32': torch.float,
-            }
-        except ImportError as _e:
-            # for intel_npu_acceleration_library < 1.1.0
-            qtype_map = {
-                'sym_int8': torch.int8,
-                'fp16': torch.half,
-                'fp32': torch.float,
-            }
+        low_bit = kwargs.pop('load_in_low_bit', 'sym_int4')
+        qtype_map = {
+            'sym_int4': "sym_int4_rtn",
+            'sym_int8': "sym_int8_rtn",
+        }
+
         invalidInputError(low_bit in qtype_map.keys(),
                           f"unsupported low_bit: {low_bit}, "
                           f"only {list(qtype_map.keys())} are supported")
@@ -143,22 +132,13 @@ def from_pretrained(cls,
             model.config.update({"bigdl_lcmu_enabled": False})

         logger.info(f"Converting model, it may takes up to several minutes ...")
-        try:
-            # for intel_npu_acceleration_library >= 1.1.0
-            from intel_npu_acceleration_library.quantization import quantize_model
-            from intel_npu_acceleration_library.compiler import create_npu_kernels
-            with torch.no_grad():
-                optimize_llm(model)
-                if qtype in ["sym_int8_rtn", "sym_int4_rtn"]:
-                    cls.load_convert(qtype, model, 'cpu', *args, **kwargs)
-                else:
-                    if not qtype.is_floating_point:
-                        model = quantize_model(model, qtype)
-                    create_npu_kernels(model)
-            model = model.eval()
-        except ImportError as _e:
-            # for intel_npu_acceleration_library < 1.1.0
-            model = npu_lib.compile(model, qtype, False)
+
+        with torch.no_grad():
+            optimize_llm(model)
+            cls.load_convert(qtype, model, 'cpu', *args, **kwargs)
+
+        model = model.eval()
+
         logger.info(f"Finish to convert model")

         model.config.update({"bigdl_transformers_low_bit": qtype})
@@ -313,22 +293,11 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
         # Loading args may differ based on their usage
         quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
         logger.info(f"Converting model, it may takes up to several minutes ...")
-        try:
-            # for intel_npu_acceleration_library >= 1.1.0
-            from intel_npu_acceleration_library.quantization import quantize_model
-            from intel_npu_acceleration_library.compiler import create_npu_kernels
-            with torch.no_grad():
-                optimize_llm(model)
-                if qtype in ["sym_int8_rtn", "sym_int4_rtn"]:
-                    cls.load_convert(qtype, model, quant_device, *model_args, **kwargs)
-                else:
-                    if not qtype.is_floating_point:
-                        model = quantize_model(model, qtype)
-                    create_npu_kernels(model)
-            model = model.eval()
-        except ImportError as _e:
-            # for intel_npu_acceleration_library < 1.1.0
-            model = npu_lib.compile(model, qtype, False)
+        with torch.no_grad():
+            optimize_llm(model)
+            cls.load_convert(qtype, model, quant_device, *model_args, **kwargs)
+
+        model = model.eval()

         if is_sharded:
             loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"]

From 3dd10231a5260c49da42afd099a1dd777af6923c Mon Sep 17 00:00:00 2001
From: leonardozcm
Date: Fri, 5 Jul 2024 13:57:09 +0800
Subject: [PATCH 2/3] unused import

---
 python/llm/src/ipex_llm/transformers/npu_model.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index f0aaf609b7e..d48c8be4a29 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -25,8 +25,6 @@
 from transformers.dynamic_module_utils import get_imports
 from transformers.configuration_utils import PretrainedConfig

-import intel_npu_acceleration_library as npu_lib
-
 from ipex_llm.utils.common.log4Error import invalidInputError
 from ipex_llm.transformers.utils import logger
 from ipex_llm.transformers.npu_models.convert import optimize_llm

From 059aa49d2c9736686f989aec3b6712564f4c4e1b Mon Sep 17 00:00:00 2001
From: leonardozcm
Date: Fri, 5 Jul 2024 15:37:41 +0800
Subject: [PATCH 3/3] create_npu_kernels

---
 python/llm/src/ipex_llm/transformers/npu_model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index d48c8be4a29..2a3ecffcda6 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -131,9 +131,11 @@ def from_pretrained(cls,

         logger.info(f"Converting model, it may takes up to several minutes ...")

+        from intel_npu_acceleration_library.compiler import create_npu_kernels
         with torch.no_grad():
             optimize_llm(model)
             cls.load_convert(qtype, model, 'cpu', *args, **kwargs)
+            create_npu_kernels(model)

         model = model.eval()

@@ -291,9 +293,11 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
         # Loading args may differ based on their usage
         quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
         logger.info(f"Converting model, it may takes up to several minutes ...")
+        from intel_npu_acceleration_library.compiler import create_npu_kernels
         with torch.no_grad():
             optimize_llm(model)
             cls.load_convert(qtype, model, quant_device, *model_args, **kwargs)
+            create_npu_kernels(model)

         model = model.eval()
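
For reference, a minimal usage sketch of the loading flow this series settles
on, assuming the AutoModelForCausalLM wrapper exposed by npu_model.py; the
model id below is a placeholder, and the snippet is an illustration rather
than part of the patches:

    from ipex_llm.transformers.npu_model import AutoModelForCausalLM

    # On the NPU path, load_in_low_bit accepts only 'sym_int4'/'sym_int8'
    # (mapped internally to the "*_rtn" qtypes); conversion then runs
    # optimize_llm, load_convert, and create_npu_kernels under torch.no_grad().
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",  # placeholder model id
        load_in_low_bit="sym_int4",
        trust_remote_code=True,
    )

Passing "sym_int4_rtn" or "sym_int8_rtn" to the CPU/GPU from_pretrained in
model.py now fails fast via invalidInputError, keeping the RTN qtypes
NPU-only.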