diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
index 70efab141e6..7c07cc93351 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
@@ -25,19 +25,6 @@
 logger = logging.get_logger(__name__)
 
 
-def get_prompt(message: str, chat_history: list[tuple[str, str]],
-               system_prompt: str) -> str:
-    texts = [f'[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
-    # The first user input is _not_ stripped
-    do_strip = False
-    for user_input, response in chat_history:
-        user_input = user_input.strip() if do_strip else user_input
-        do_strip = True
-        texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
-    message = message.strip() if do_strip else message
-    texts.append(f'{message} [/INST]')
-    return ''.join(texts)
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Predict Tokens using `generate()` API for npu model"
@@ -108,11 +95,15 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
     with torch.inference_mode():
         print("finish to load")
         for i in range(3):
-            prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
-            _input_ids = tokenizer.encode(prompt, return_tensors="pt")
+            messages = [{"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": args.prompt}]
+            text = tokenizer.apply_chat_template(messages,
+                                                 tokenize=False,
+                                                 add_generation_prompt=True)
+            _input_ids = tokenizer([text], return_tensors="pt").input_ids
             print("-" * 20, "Input", "-" * 20)
             print("input length:", len(_input_ids[0]))
-            print(prompt)
+            print(args.prompt)
             print("-" * 20, "Output", "-" * 20)
             st = time.time()
             output = model.generate(