
Commit

Replace with IPEX-LLM in example comments (intel#10671)
* Replace with IPEX-LLM in example comments

* More replacement

* revert some changes
JinBridger authored Apr 7, 2024
1 parent 08018a1 commit 10ee786
Showing 159 changed files with 183 additions and 183 deletions.
@@ -48,7 +48,7 @@
 import urllib.request
 import os
 import json
-# code change to import from bigdl-llm API instead of using transformers API
+# code change to import from IPEX-LLM API instead of using transformers API
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
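The hunk above shows the core migration pattern for these examples: import `AutoModelForCausalLM` from `ipex_llm.transformers` instead of `transformers`, and the rest of the Hugging Face workflow stays the same. A minimal sketch of that pattern, assuming a hypothetical model path and prompt (`load_in_4bit=True` applies the INT4 optimization referenced throughout this commit):

from ipex_llm.transformers import AutoModelForCausalLM  # drop-in replacement for the transformers class
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # hypothetical model path, for illustration only

# Load with IPEX-LLM INT4 optimizations applied at load time
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))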
4 changes: 2 additions & 2 deletions python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -87,7 +87,7 @@
 replace_method="auto"
 )

-# Apply BigDL-LLM INT4 optimizations on transformers
+# Apply IPEX-LLM INT4 optimizations on transformers
 model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')

 model = model.to(f'cpu:{local_rank}')
@@ -111,7 +111,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)
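The Deepspeed-AutoTP example above applies the optimization to an already-constructed model through `optimize_model` instead of at `from_pretrained` time. A minimal sketch of that API on a small placeholder model, assuming the import path `from ipex_llm import optimize_model` (the DeepSpeed tensor-parallel setup from the example is omitted):

import torch
from transformers import AutoModelForCausalLM  # plain transformers load; optimization applied afterwards
from ipex_llm import optimize_model            # assumed import path for the helper used in the hunk above

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m",  # hypothetical small model
                                             torch_dtype=torch.float32)

# Apply IPEX-LLM INT4 optimizations to the existing module, as in the hunk above
model = optimize_model(model, low_bit='sym_int4')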
@@ -59,7 +59,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
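The `use_cache` comment above recurs across most of the changed files: when a model's config contains `"use_cache": false`, passing `use_cache=True` to `generate` re-enables the key/value cache so each new token does not recompute attention over the full prefix. A short sketch of the call, assuming `model` and `tokenizer` were loaded as in the earlier sketch:

import time

input_ids = tokenizer.encode("Once upon a time", return_tensors="pt")  # hypothetical prompt

st = time.time()
# Explicitly enable the KV cache for optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
                        use_cache=True,
                        max_new_tokens=32)
end = time.time()
print(f"Inference time: {end - st:.2f} s")
print(tokenizer.decode(output[0], skip_special_tokens=True))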
@@ -44,7 +44,7 @@

 model_path = args.model

-# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
+# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
 model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,)

 # Generate predicted tokens
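`from_gguf` loads a GGUF checkpoint directly and returns both an IPEX-LLM model and a matching Hugging Face tokenizer, as the hunk above notes. A minimal sketch, assuming a hypothetical local GGUF file:

from ipex_llm.transformers import AutoModelForCausalLM

gguf_path = "./llama-2-7b-chat.Q4_0.gguf"  # hypothetical GGUF file, for illustration only

# Load gguf model and vocab, then convert them to an IPEX-LLM model and huggingface tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(gguf_path, low_bit='sym_int4')

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))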
@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -45,7 +45,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True,
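In the hunk above, `use_cache=True` is passed to `from_pretrained` rather than to `generate`, so the flag is forwarded to the model config once instead of being repeated on every call. A brief sketch of that variant, assuming a hypothetical model path:

from ipex_llm.transformers import AutoModelForCausalLM

# Passing use_cache=True at load time records the setting in the model config,
# so later generate() calls use the KV cache without extra arguments
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b",  # hypothetical model path
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)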
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 output = model.generate(input_ids, max_new_tokens=args.n_predict)

@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -61,7 +61,7 @@
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Dolly v1 models
 output = model.generate(input_ids,
 use_cache=True,
@@ -64,7 +64,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 pad_token_id=tokenizer.pad_token_id,
@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -38,7 +38,7 @@
 image = Image.open(args.image_path)

 # Load model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu',
 load_in_4bit = True,
 trust_remote_code=True,
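Some multimodal checkpoints need part of the network left unquantized; `modules_to_not_convert` names the submodules the INT4 conversion should skip (here `vision_embed_tokens` for Fuyu; the InternLM-XComposer and Qwen-VL-Chat hunks below do the same for `qkv` and `c_fc`/`out_proj`). A minimal sketch, assuming a hypothetical Fuyu checkpoint:

from ipex_llm.transformers import AutoModelForCausalLM

# Skip the vision embedding module during INT4 conversion so the optimization succeeds on Fuyu
model = AutoModelForCausalLM.from_pretrained("adept/fuyu-8b",  # hypothetical model path
                                             device_map='cpu',
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             modules_to_not_convert=['vision_embed_tokens'])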
@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -37,7 +37,7 @@
 image = args.image_path

 # Load model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True,
 trust_remote_code=True, modules_to_not_convert=['qkv'])

@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@
 input_ids = tokenizer.encode(prompt, return_tensors="pt")
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for MPT models
 mpt_generation_config = GenerationConfig(
 max_new_tokens=args.n_predict,
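For the MPT and phi-family examples in this commit, `use_cache` is enabled through a `GenerationConfig` object rather than as a direct `generate` keyword. A short sketch, assuming `model` and `tokenizer` were loaded as in the earlier sketches:

from transformers import GenerationConfig

# Enable the KV cache via GenerationConfig, as the MPT and phi examples do
generation_config = GenerationConfig(max_new_tokens=32, use_cache=True)

input_ids = tokenizer.encode("Once upon a time", return_tensors="pt")  # hypothetical prompt
output = model.generate(input_ids, generation_config=generation_config)
print(tokenizer.decode(output[0], skip_special_tokens=True))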
@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phi-1_5 uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 # Note that phi-2 uses GenerationConfig to enable 'use_cache'
@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phixtral uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)
@@ -36,7 +36,7 @@
 model_path = args.repo_id_or_model_path

 # Load model
-# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
+# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 device_map="cpu",
@@ -64,7 +64,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
(Diff truncated; the remaining changed files are not shown here.)
