diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
index f3e3ddbc0cc..70efab141e6 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
@@ -79,6 +79,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True,
             save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -90,8 +92,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
index cb640bc7b05..d11b1891e35 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
@@ -78,6 +78,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
             attn_implementation="eager",
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -88,8 +90,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
             pipeline=True,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
index ac3433b92b4..baf923374af 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
@@ -84,6 +84,8 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
             attn_implementation="eager",
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -94,8 +96,8 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
             pipeline=True,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py
index df5bd756c99..fe2868c292b 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py
@@ -66,6 +66,8 @@
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True,
             save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -77,8 +79,8 @@
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
index ef5ded70896..e1f4be49f90 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
@@ -70,6 +70,8 @@
             mixed_precision=True,
             trust_remote_code=True,
             save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -79,8 +81,8 @@
             max_prompt_len=args.max_prompt_len,
             pipeline=True,
             transpose_value_cache=not args.disable_transpose_value_cache)
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
index 05c47076ede..cdf26af179b 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
@@ -79,6 +79,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -90,8 +92,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     DEFAULT_SYSTEM_PROMPT = """\
     """
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
index 41a14e1a32b..d3abd13a6e6 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
@@ -43,7 +43,6 @@
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if not args.lowbit_path or not os.path.exists(args.lowbit_path):
         model = AutoModelForCausalLM.from_pretrained(
@@ -52,6 +51,8 @@
             load_in_low_bit=args.load_in_low_bit,
             attn_implementation="eager"
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.lowbit_path)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.lowbit_path,
@@ -59,6 +60,7 @@
             bigdl_transformers_low_bit=args.load_in_low_bit,
             attn_implementation="eager"
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.lowbit_path, trust_remote_code=True)
 
     print(model)
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
index 83fe6d899eb..d981f39f97e 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
@@ -79,6 +79,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -89,8 +91,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
             max_prompt_len=args.max_prompt_len,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     DEFAULT_SYSTEM_PROMPT = """\
     """
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
index 85cca7fd6db..35ee4902246 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
@@ -80,6 +80,8 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -90,8 +92,8 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
             max_prompt_len=args.max_prompt_len,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     DEFAULT_SYSTEM_PROMPT = """\
     """
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
index 5ec0bf7289c..b177042cc2b 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
@@ -65,6 +65,8 @@
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -76,7 +78,7 @@
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True,
         )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
     print("-" * 80)
     print("done")
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
index 9f03c908b96..caf6d1b3e54 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
@@ -71,6 +71,8 @@
             quantization_group_size=args.quantization_group_size,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -81,8 +83,8 @@
             max_prompt_len=args.max_prompt_len,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     print("-" * 80)
     print("done")
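
Every file above applies the same pattern: on the first (conversion) run the tokenizer is loaded from the original `model_path` and saved alongside the converted low-bit weights, so later runs can load both the model and the tokenizer from the save directory alone, without the original Hugging Face repo. Below is a minimal sketch of that flow, not part of the patch; the repo id and directory are placeholders, and the keyword arguments simply mirror the ones visible in the diffs.

```python
import os
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # same import these examples use

model_path = "meta-llama/Llama-2-7b-chat-hf"   # placeholder Hugging Face repo id
save_directory = "./npu-low-bit-model"         # placeholder save directory

if not os.path.exists(save_directory):
    # First run: convert the model to low-bit, then save the matching tokenizer beside it.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_low_bit="sym_int4",
                                                 attn_implementation="eager",
                                                 save_directory=save_directory)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.save_pretrained(save_directory)
else:
    # Later runs: both model and tokenizer come from save_directory.
    model = AutoModelForCausalLM.load_low_bit(save_directory, attn_implementation="eager")
    tokenizer = AutoTokenizer.from_pretrained(save_directory, trust_remote_code=True)
```

This keeps the converted checkpoint self-contained, which is what relocating the `AutoTokenizer` calls in each example achieves.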