diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Model/llama2/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Model/llama2/README.md
index dc289c8cc2c..ff4a9c1c059 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Model/llama2/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Model/llama2/README.md
@@ -21,6 +21,8 @@ pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-exte
 
 # below command will install intel_npu_acceleration_library
 pip install intel-npu-acceleration-library==1.3
+
+pip install transformers==4.40
 ```
 
 ### 2. Runtime Configurations
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/llama.py b/python/llm/src/ipex_llm/transformers/npu_models/llama.py
index 8a830d9e1a3..e06d61e16e8 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/llama.py
@@ -106,7 +106,7 @@ def llama_model_forward(
     from ipex_llm.transformers.kv import DynamicNormalCache
     if use_cache and not isinstance(past_key_values, DynamicNormalCache):
         past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
-    past_seen_tokens = past_key_values.set_seq_length()
+    past_seen_tokens = past_key_values.get_seq_length()
     if cache_position is None:
         cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1],
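
For context, a minimal sketch of the cache API that the one-line fix relies on, assuming `transformers==4.40` (as pinned in the README change above) and that ipex_llm's `DynamicNormalCache` follows the standard `transformers` Cache interface. The Cache classes expose `get_seq_length()`, which returns the number of tokens already cached; there is no `set_seq_length()` method, so the removed call would raise `AttributeError`. The tensor shapes below are illustrative only.

```python
# Minimal sketch, assuming transformers==4.40; uses the library's own DynamicCache
# to illustrate the API that the patched llama_model_forward calls.
import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
print(cache.get_seq_length())  # 0 -- empty cache holds no tokens

# Append dummy key/value states for layer 0; shape is (batch, heads, seq_len, head_dim).
key = value = torch.zeros(1, 8, 4, 64)
cache.update(key, value, layer_idx=0)
print(cache.get_seq_length())  # 4 -- matches the cached sequence length

# The call removed by the patch would fail, since no such method exists:
# cache.set_seq_length()  # AttributeError
```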