Fix the non-stop generation issue in the llama3 examples (#10860)
* fix the non-stop generation issue in GPU/HF-Transformers-AutoModels
* fix the non-stop generation issue in GPU/PyTorch-Models/Model/llama3
* fix the non-stop generation issue in CPU/HF-Transformers-AutoModels/Model/llama3
* fix the non-stop generation issue in CPU/PyTorch-Models/Model/llama3
* update the output in the README
* update format
* add reference
* update the prompt format
* update the output format in the README
* update the example output in the README
hxsz1997 authored Apr 23, 2024
1 parent 5c9eb5d commit 328b1a1
Showing 8 changed files with 66 additions and 24 deletions.
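
Across all four example directories the fix is the same: Llama 3 Instruct ends each turn with the special `<|eot_id|>` token rather than the tokenizer's default eos token, so `generate()` previously kept producing text until `max_new_tokens` ran out, and the chat prompt needs a blank line (`\n\n`) after each `<|end_header_id|>` header, as described in the Meta-Llama-3-8B-Instruct model card referenced below. A minimal sketch of the resulting usage, shown here with plain `transformers` (the examples themselves load the model through `ipex_llm`); the model path and prompt are placeholders:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder; use your local path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Llama 3 Instruct signals end-of-turn with <|eot_id|>, so both it and the
# default eos token must be accepted as stop tokens.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# Prompt in the Llama 3 chat format, with a blank line after each header.
prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, eos_token_id=terminators, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=False))
```
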
python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama3/README.md
@@ -57,12 +57,16 @@ numactl -C 0-47 -m 0 python ./generate.py
Inference time: xxxx s
-------------------- Prompt --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-------------------- Output (skip_special_tokens=False) --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
A question that gets to the heart of the 21st century!
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as:
+1. Learning: AI
```
python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama3/generate.py
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
prompt_texts = [f'<|begin_of_text|>']

if system_prompt != '':
-prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')

for history_input, history_response in chat_history:
-prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')

-prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
return ''.join(prompt_texts)

if __name__ == '__main__':
@@ -63,13 +63,20 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

+# here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+]

# Generate predicted tokens
with torch.inference_mode():
prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
input_ids = tokenizer.encode(prompt, return_tensors="pt")
st = time.time()
output = model.generate(input_ids,
+eos_token_id=terminators,
max_new_tokens=args.n_predict)
end = time.time()
output_str = tokenizer.decode(output[0], skip_special_tokens=False)
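For reference, the same `\n\n`-separated prompt that `get_prompt` builds by hand can also be produced by the tokenizer's built-in chat template. This is a sketch, not part of this commit; it assumes the Meta-Llama-3 tokenizer (which ships such a template) and reuses `DEFAULT_SYSTEM_PROMPT`, `args`, `model`, and `terminators` from the example above:

```python
messages = [
    {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
    {"role": "user", "content": args.prompt},
]
# apply_chat_template wraps each turn in <|start_header_id|>...<|end_header_id|>\n\n
# and add_generation_prompt appends the assistant header so the model answers next.
input_ids = tokenizer.apply_chat_template(messages,
                                          add_generation_prompt=True,
                                          return_tensors="pt")
output = model.generate(input_ids,
                        eos_token_id=terminators,
                        max_new_tokens=args.n_predict)
```
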
8 changes: 6 additions & 2 deletions python/llm/example/CPU/PyTorch-Models/Model/llama3/README.md
@@ -57,12 +57,16 @@ In the example, several arguments can be passed to satisfy your requirements:
Inference time: xxxx s
-------------------- Prompt --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-------------------- Output (skip_special_tokens=False) --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
A question that gets to the heart of the 21st century!
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as:
+1. Learning: AI
```
15 changes: 11 additions & 4 deletions python/llm/example/CPU/PyTorch-Models/Model/llama3/generate.py
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
prompt_texts = [f'<|begin_of_text|>']

if system_prompt != '':
-prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')

for history_input, history_response in chat_history:
-prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')

-prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
return ''.join(prompt_texts)

if __name__ == '__main__':
@@ -65,13 +65,20 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

+# here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+]

# Generate predicted tokens
with torch.inference_mode():
prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
input_ids = tokenizer.encode(prompt, return_tensors="pt")
st = time.time()
output = model.generate(input_ids,
+eos_token_id=terminators,
max_new_tokens=args.n_predict)
end = time.time()
output_str = tokenizer.decode(output[0], skip_special_tokens=False)
python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama3/README.md
@@ -125,12 +125,14 @@ Arguments info:
Inference time: xxxx s
-------------------- Prompt --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-------------------- Output (skip_special_tokens=False) --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
A question that gets to the heart of the 21st century!
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as learning, problem-solving, decision
```
python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama3/generate.py
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
prompt_texts = [f'<|begin_of_text|>']

if system_prompt != '':
-prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')

for history_input, history_response in chat_history:
-prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')

-prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
return ''.join(prompt_texts)

if __name__ == '__main__':
@@ -67,17 +67,25 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

+# here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+]

# Generate predicted tokens
with torch.inference_mode():
prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
# ipex_llm model needs a warmup, then inference time can be accurate
output = model.generate(input_ids,
+eos_token_id=terminators,
max_new_tokens=args.n_predict)

# start inference
st = time.time()
output = model.generate(input_ids,
+eos_token_id=terminators,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()
end = time.time()
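In the GPU variants above, the timed call is preceded by an identical warmup `generate()` and followed by `torch.xpu.synchronize()`, so the reported time excludes one-time optimization overhead and waits for all queued XPU kernels to finish. A minimal timing sketch following that pattern, assuming a model and `input_ids` already moved to the `'xpu'` device and `terminators` defined as above:

```python
import time

import torch

with torch.inference_mode():
    # Warmup run: the first generate() call carries one-time setup cost on XPU,
    # so its latency is not representative.
    model.generate(input_ids, eos_token_id=terminators, max_new_tokens=args.n_predict)

    st = time.time()
    output = model.generate(input_ids, eos_token_id=terminators, max_new_tokens=args.n_predict)
    torch.xpu.synchronize()  # wait for queued XPU work before reading the clock
    end = time.time()
    print(f"Inference time: {end - st} s")
```
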
6 changes: 4 additions & 2 deletions python/llm/example/GPU/PyTorch-Models/Model/llama3/README.md
@@ -126,12 +126,14 @@ In the example, several arguments can be passed to satisfy your requirements:
Inference time: xxxx s
-------------------- Prompt --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-------------------- Output (skip_special_tokens=False) --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
A question that gets to the heart of the 21st century!
-Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that
+Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as learning, problem-solving, decision
```
16 changes: 12 additions & 4 deletions python/llm/example/GPU/PyTorch-Models/Model/llama3/generate.py
@@ -31,13 +31,13 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
prompt_texts = [f'<|begin_of_text|>']

if system_prompt != '':
-prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')

for history_input, history_response in chat_history:
-prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{history_input.strip()}<|eot_id|>')
-prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n{history_response.strip()}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
+prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')

-prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n')
+prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
return ''.join(prompt_texts)

if __name__ == '__main__':
@@ -69,18 +69,26 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

+# here the terminators refer to https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+]

# Generate predicted tokens
with torch.inference_mode():
prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
# ipex_llm model needs a warmup, then inference time can be accurate
output = model.generate(input_ids,
+eos_token_id=terminators,
max_new_tokens=args.n_predict)

# start inference
st = time.time()
output = model.generate(input_ids,
+eos_token_id=terminators,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()
end = time.time()
