
Commit

Replace with IPEX-LLM in example comments (intel#10671)
* Replace with IPEX-LLM in example comments

* More replacement

* revert some changes
JinBridger authored Apr 7, 2024
1 parent 08018a1 commit 10ee786
Showing 159 changed files with 183 additions and 183 deletions.
@@ -48,7 +48,7 @@
 import urllib.request
 import os
 import json
-# code change to import from bigdl-llm API instead of using transformers API
+# code change to import from IPEX-LLM API instead of using transformers API
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
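The hunk above shows the core migration pattern for these examples: import `AutoModelForCausalLM` from `ipex_llm.transformers` instead of `transformers`, and the rest of the Hugging Face workflow stays the same. A minimal sketch of that pattern, assuming a hypothetical model path and prompt (`load_in_4bit=True` applies the INT4 optimization referenced throughout this commit):

from ipex_llm.transformers import AutoModelForCausalLM  # drop-in replacement for the transformers class
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # hypothetical model path, for illustration only

# Load with IPEX-LLM INT4 optimizations applied at load time
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))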
4 changes: 2 additions & 2 deletions python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -87,7 +87,7 @@
 replace_method="auto"
 )

-# Apply BigDL-LLM INT4 optimizations on transformers
+# Apply IPEX-LLM INT4 optimizations on transformers
 model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')

 model = model.to(f'cpu:{local_rank}')
@@ -111,7 +111,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)
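The Deepspeed-AutoTP example above applies the optimization to an already-constructed model through `optimize_model` instead of at `from_pretrained` time. A minimal sketch of that API on a small placeholder model, assuming the import path `from ipex_llm import optimize_model` (the DeepSpeed tensor-parallel setup from the example is omitted):

import torch
from transformers import AutoModelForCausalLM  # plain transformers load; optimization applied afterwards
from ipex_llm import optimize_model            # assumed import path for the helper used in the hunk above

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m",  # hypothetical small model
                                             torch_dtype=torch.float32)

# Apply IPEX-LLM INT4 optimizations to the existing module, as in the hunk above
model = optimize_model(model, low_bit='sym_int4')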
@@ -59,7 +59,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
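The `use_cache` comment above recurs across most of the changed files: when a model's config contains `"use_cache": false`, passing `use_cache=True` to `generate` re-enables the key/value cache so each new token does not recompute attention over the full prefix. A short sketch of the call, assuming `model` and `tokenizer` were loaded as in the earlier sketch:

import time

input_ids = tokenizer.encode("Once upon a time", return_tensors="pt")  # hypothetical prompt

st = time.time()
# Explicitly enable the KV cache for optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
                        use_cache=True,
                        max_new_tokens=32)
end = time.time()
print(f"Inference time: {end - st:.2f} s")
print(tokenizer.decode(output[0], skip_special_tokens=True))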
@@ -44,7 +44,7 @@

 model_path = args.model

-# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
+# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
 model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,)

 # Generate predicted tokens
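`from_gguf` loads a GGUF checkpoint directly and returns both an IPEX-LLM model and a matching Hugging Face tokenizer, as the hunk above notes. A minimal sketch, assuming a hypothetical local GGUF file:

from ipex_llm.transformers import AutoModelForCausalLM

gguf_path = "./llama-2-7b-chat.Q4_0.gguf"  # hypothetical GGUF file, for illustration only

# Load gguf model and vocab, then convert them to an IPEX-LLM model and huggingface tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(gguf_path, low_bit='sym_int4')

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))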
@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -45,7 +45,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True,
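In the hunk above, `use_cache=True` is passed to `from_pretrained` rather than to `generate`, so the flag is forwarded to the model config once instead of being repeated on every call. A brief sketch of that variant, assuming a hypothetical model path:

from ipex_llm.transformers import AutoModelForCausalLM

# Passing use_cache=True at load time records the setting in the model config,
# so later generate() calls use the KV cache without extra arguments
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b",  # hypothetical model path
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)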
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 output = model.generate(input_ids, max_new_tokens=args.n_predict)

@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -61,7 +61,7 @@
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Dolly v1 models
 output = model.generate(input_ids,
 use_cache=True,
@@ -64,7 +64,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 pad_token_id=tokenizer.pad_token_id,
@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -38,7 +38,7 @@
 image = Image.open(args.image_path)

 # Load model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu',
 load_in_4bit = True,
 trust_remote_code=True,
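Some multimodal checkpoints need part of the network left unquantized; `modules_to_not_convert` names the submodules the INT4 conversion should skip (here `vision_embed_tokens` for Fuyu; the InternLM-XComposer and Qwen-VL-Chat hunks below do the same for `qkv` and `c_fc`/`out_proj`). A minimal sketch, assuming a hypothetical Fuyu checkpoint:

from ipex_llm.transformers import AutoModelForCausalLM

# Skip the vision embedding module during INT4 conversion so the optimization succeeds on Fuyu
model = AutoModelForCausalLM.from_pretrained("adept/fuyu-8b",  # hypothetical model path
                                             device_map='cpu',
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             modules_to_not_convert=['vision_embed_tokens'])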
@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -37,7 +37,7 @@
 image = args.image_path

 # Load model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True,
 trust_remote_code=True, modules_to_not_convert=['qkv'])

@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@
 input_ids = tokenizer.encode(prompt, return_tensors="pt")
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for MPT models
 mpt_generation_config = GenerationConfig(
 max_new_tokens=args.n_predict,
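For the MPT and phi-family examples in this commit, `use_cache` is enabled through a `GenerationConfig` object rather than as a direct `generate` keyword. A short sketch, assuming `model` and `tokenizer` were loaded as in the earlier sketches:

from transformers import GenerationConfig

# Enable the KV cache via GenerationConfig, as the MPT and phi examples do
generation_config = GenerationConfig(max_new_tokens=32, use_cache=True)

input_ids = tokenizer.encode("Once upon a time", return_tensors="pt")  # hypothetical prompt
output = model.generate(input_ids, generation_config=generation_config)
print(tokenizer.decode(output[0], skip_special_tokens=True))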
@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phi-1_5 uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 # Note that phi-2 uses GenerationConfig to enable 'use_cache'
@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phixtral uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)
@@ -36,7 +36,7 @@
 model_path = args.repo_id_or_model_path

 # Load model
-# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
+# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 device_map="cpu",
@@ -64,7 +64,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
(Diff truncated; the remaining changed files are not shown here.)
