From d409d9d0eb978a4ced7c534ea24e65b1a76d6673 Mon Sep 17 00:00:00 2001 From: binbin Deng <108676127+plusbang@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:38:10 +0800 Subject: [PATCH] [NPU L0] Update streaming mode of example (#12312) --- .../LLM/Pipeline-Models/README.md | 21 ++++++---------- .../LLM/Pipeline-Models/baichuan2.py | 24 ++++++++++++------- .../LLM/Pipeline-Models/llama2.py | 24 ++++++++++++------- .../LLM/Pipeline-Models/llama3.py | 24 ++++++++++++------- .../LLM/Pipeline-Models/minicpm.py | 24 ++++++++++++------- .../LLM/Pipeline-Models/qwen.py | 24 ++++++++++++------- .../npu_pipeline_model/convert_pipeline.py | 4 +--- 7 files changed, 83 insertions(+), 62 deletions(-) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md index e3569496184..d30ce356d29 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md @@ -72,28 +72,21 @@ Arguments info: - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`. - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`. - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache. +- `--disable-streaming`: Disable streaming mode of generation. -### Sample Output +### Sample Output of Streaming Mode #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) - + ```log - Number of input tokens: 28 - Generated tokens: 32 - First token generation time: xxxx s - Generation average latency: xxxx ms, (xxxx token/s) - Generation time: xxxx s - -Inference time: xxxx s -------------------- Input -------------------- - [INST] <> +input length: 28 +[INST] <> <> What is AI? [/INST] -------------------- Output -------------------- - [INST] <> + AI (Artificial Intelligence) is a field of computer science and technology that focuses on the development of intelligent machines that can perform -<> - -What is AI? [/INST] AI (Artificial Intelligence) is a field of computer science and technology that focuses on the development of intelligent machines that can perform +Inference time: xxxx s ``` diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py index 3217b1a4681..96c77fb1923 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py @@ -20,7 +20,7 @@ import time import argparse from ipex_llm.transformers.npu_model import AutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, TextStreamer from transformers.utils import logging logger = logging.get_logger(__name__) @@ -61,6 +61,7 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--disable-streaming", action="store_true", default=False) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -92,6 +93,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], if args.lowbit_path and not os.path.exists(args.lowbit_path): model.save_low_bit(args.lowbit_path) + if args.disable_streaming: + streamer = None + else: + streamer = TextStreamer(tokenizer=tokenizer, skip_special_tokens=True) + DEFAULT_SYSTEM_PROMPT = """\ """ @@ -99,22 +105,22 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], print("done") with torch.inference_mode(): print("finish to load") - for i in range(5): + for i in range(3): prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) _input_ids = tokenizer.encode(prompt, return_tensors="pt") + print("-" * 20, "Input", "-" * 20) print("input length:", len(_input_ids[0])) + print(prompt) + print("-" * 20, "Output", "-" * 20) st = time.time() output = model.generate( - _input_ids, max_new_tokens=args.n_predict, do_print=True + _input_ids, max_new_tokens=args.n_predict, streamer=streamer ) end = time.time() + if args.disable_streaming: + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(output_str) print(f"Inference time: {end-st} s") - input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) - print("-" * 20, "Input", "-" * 20) - print(input_str) - output_str = tokenizer.decode(output[0], skip_special_tokens=False) - print("-" * 20, "Output", "-" * 20) - print(output_str) print("-" * 80) print("done") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py index 6f36051f36b..c7168bcb4b9 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py @@ -20,7 +20,7 @@ import time import argparse from ipex_llm.transformers.npu_model import AutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, TextStreamer from transformers.utils import logging logger = logging.get_logger(__name__) @@ -62,6 +62,7 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--disable-streaming", action="store_true", default=False) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -91,6 +92,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], if args.lowbit_path and not os.path.exists(args.lowbit_path): model.save_low_bit(args.lowbit_path) + + if args.disable_streaming: + streamer = None + else: + streamer = TextStreamer(tokenizer=tokenizer, skip_special_tokens=True) DEFAULT_SYSTEM_PROMPT = """\ """ @@ -99,22 +105,22 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], print("done") with torch.inference_mode(): print("finish to load") - for i in range(5): + for i in range(3): prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) _input_ids = tokenizer.encode(prompt, return_tensors="pt") + print("-" * 20, "Input", "-" * 20) print("input length:", len(_input_ids[0])) + print(prompt) + print("-" * 20, "Output", "-" * 20) st = time.time() output = model.generate( - _input_ids, max_new_tokens=args.n_predict, do_print=True + _input_ids, max_new_tokens=args.n_predict, streamer=streamer ) end = time.time() + if args.disable_streaming: + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(output_str) print(f"Inference time: {end-st} s") - input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) - print("-" * 20, "Input", "-" * 20) - print(input_str) - output_str = tokenizer.decode(output[0], skip_special_tokens=False) - print("-" * 20, "Output", "-" * 20) - print(output_str) print("-" * 80) print("done") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py index eed246d439d..a837e03c6f3 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py @@ -20,7 +20,7 @@ import time import argparse from ipex_llm.transformers.npu_model import AutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, TextStreamer from transformers.utils import logging logger = logging.get_logger(__name__) @@ -68,6 +68,7 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--disable-streaming", action="store_true", default=False) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -98,26 +99,31 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], if args.lowbit_path and not os.path.exists(args.lowbit_path): model.save_low_bit(args.lowbit_path) + if args.disable_streaming: + streamer = None + else: + streamer = TextStreamer(tokenizer=tokenizer, skip_special_tokens=True) + print("-" * 80) print("done") with torch.inference_mode(): print("finish to load") - for i in range(5): + for i in range(3): prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) _input_ids = tokenizer.encode(prompt, return_tensors="pt") + print("-" * 20, "Input", "-" * 20) print("input length:", len(_input_ids[0])) + print(prompt) + print("-" * 20, "Output", "-" * 20) st = time.time() output = model.generate( - _input_ids, max_new_tokens=args.n_predict, do_print=True + _input_ids, max_new_tokens=args.n_predict, streamer=streamer ) end = time.time() + if args.disable_streaming: + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(output_str) print(f"Inference time: {end-st} s") - input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) - print("-" * 20, "Input", "-" * 20) - print(input_str) - output_str = tokenizer.decode(output[0], skip_special_tokens=False) - print("-" * 20, "Output", "-" * 20) - print(output_str) print("-" * 80) print("done") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py index 9cd01218852..a84f78a74b0 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py @@ -19,7 +19,7 @@ import time import argparse from ipex_llm.transformers.npu_model import AutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, TextStreamer from transformers.utils import logging import os @@ -48,6 +48,7 @@ parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--disable-streaming", action="store_true", default=False) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -79,26 +80,31 @@ if args.lowbit_path and not os.path.exists(args.lowbit_path): model.save_low_bit(args.lowbit_path) + if args.disable_streaming: + streamer = None + else: + streamer = TextStreamer(tokenizer=tokenizer, skip_special_tokens=True) + print("-" * 80) print("done") with torch.inference_mode(): print("finish to load") - for i in range(5): + for i in range(3): prompt = "<用户>{}".format(args.prompt) _input_ids = tokenizer.encode(prompt, return_tensors="pt") + print("-" * 20, "Input", "-" * 20) print("input length:", len(_input_ids[0])) + print(prompt) + print("-" * 20, "Output", "-" * 20) st = time.time() output = model.generate( - _input_ids, max_new_tokens=args.n_predict, do_print=True + _input_ids, max_new_tokens=args.n_predict, streamer=streamer ) end = time.time() + if args.disable_streaming: + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(output_str) print(f"Inference time: {end-st} s") - input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) - print("-" * 20, "Input", "-" * 20) - print(input_str) - output_str = tokenizer.decode(output[0], skip_special_tokens=False) - print("-" * 20, "Output", "-" * 20) - print(output_str) print("-" * 80) print("done") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py index 54338da6493..0055b248482 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py @@ -20,7 +20,7 @@ import time import argparse from ipex_llm.transformers.npu_model import AutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, TextStreamer from transformers.utils import logging logger = logging.get_logger(__name__) @@ -50,6 +50,7 @@ parser.add_argument('--load_in_low_bit', type=str, default="sym_int4", help='Load in low bit to use') parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--disable-streaming", action="store_true", default=False) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -81,6 +82,11 @@ if args.lowbit_path and not os.path.exists(args.lowbit_path): model.save_low_bit(args.lowbit_path) + if args.disable_streaming: + streamer = None + else: + streamer = TextStreamer(tokenizer=tokenizer, skip_special_tokens=True) + print("-" * 80) print("done") messages = [{"role": "system", "content": "You are a helpful assistant."}, @@ -90,21 +96,21 @@ add_generation_prompt=True) with torch.inference_mode(): print("finish to load") - for i in range(5): + for i in range(3): _input_ids = tokenizer([text], return_tensors="pt").input_ids + print("-" * 20, "Input", "-" * 20) print("input length:", len(_input_ids[0])) + print(text) + print("-" * 20, "Output", "-" * 20) st = time.time() output = model.generate( - _input_ids, max_new_tokens=args.n_predict, do_print=True + _input_ids, max_new_tokens=args.n_predict, streamer=streamer ) end = time.time() + if args.disable_streaming: + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(output_str) print(f"Inference time: {end-st} s") - input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) - print("-" * 20, "Input", "-" * 20) - print(input_str) - output_str = tokenizer.decode(output[0], skip_special_tokens=False) - print("-" * 20, "Output", "-" * 20) - print(output_str) print("-" * 80) print("done") diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 39760cf9eee..5efeb3b3ee3 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -134,7 +134,6 @@ def generate( try: input_pipe = open(in_pipe_path, "wb") except: - print('Waiting for input pipe') time.sleep(1) else: break @@ -143,7 +142,6 @@ def generate( try: output_pipe = open(out_pipe_path, "rb") except: - print('Waiting for output pipe') time.sleep(1) else: break @@ -152,7 +150,7 @@ def generate( bdata = str.encode(str(temp_dir)) invalidInputError(len(bdata) <= 2000, - f"Leng of input directory is too long ({len(bdata)}), " + f"Length of input directory is too long ({len(bdata)}), " "which may cause read error.") input_pipe.write(bdata) input_pipe.flush()