ipex-llm-multi-test.py
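# Gradio chat demo for comparing multiple models: downloads Qwen1.5 / ChatGLM3
# checkpoints from ModelScope, accelerates them with ipex_llm.optimize_model,
# and streams generations through a ChatInterface. Models are switched at
# runtime from a dropdown.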
import os

# Install dependencies at runtime (useful in hosted environments such as
# ModelScope Studio, where no pre-built image is available).
os.system('pip install accelerate')
os.system('pip install ipex-llm[all]')
os.system('pip install modelscope -U')
os.system('pip install --upgrade transformers==4.37.0')
os.system('pip install transformers_stream_generator')
from threading import Thread
from typing import Iterator
import gc
import gradio as gr
import torch
from modelscope import AutoModelForCausalLM, AutoModel, AutoTokenizer
from transformers import TextIteratorStreamer
from ipex_llm import optimize_model
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 128
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

# ChatGLM checkpoints are loaded via AutoModel; everything else uses AutoModelForCausalLM.
CHATGLM_IDS = ['ZhipuAI/chatglm-6b', 'ZhipuAI/chatglm2-6b', 'ZhipuAI/chatglm3-6b']
model_options = ['qwen/Qwen1.5-1.8B-Chat', 'qwen/Qwen1.5-0.5B-Chat', 'ZhipuAI/chatglm3-6b']
# No model is loaded at startup; pick one with the "Load model" button instead.
model = None
tokenizer = None
def change_model(model_id):
    global model, tokenizer
    try:
        if model_id in CHATGLM_IDS:
            model = AutoModel.from_pretrained(model_id, torch_dtype="auto", trust_remote_code=True)
        else:
            model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        # Apply ipex-llm optimizations to the freshly loaded model.
        model = optimize_model(model)
        gc.collect()  # reclaim memory from any previously loaded model
        return f"Successfully loaded {model_id}"
    except Exception as e:
        return f"Error loading model: {str(e)}"
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    if model is None or tokenizer is None:
        yield "No model loaded. Please load a model first."
        return
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    with torch.inference_mode():
        # Render the conversation with the model's chat template, then tokenize.
        prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
        input_ids = inputs.input_ids
        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
            # Trim overlong prompts, keeping the most recent tokens.
            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
        )
        # Run generation on a background thread so tokens can be yielded as they arrive.
        t = Thread(target=model.generate, kwargs=generate_kwargs)
        t.start()
        outputs = []
        for text in streamer:
            outputs.append(text)
            yield "".join(outputs)
        print("".join(outputs))  # log the full response for debugging
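# gr.ChatInterface streams generator output: each string yielded by generate()
# replaces the in-progress assistant message, so replies render token by token.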
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Textbox(label="System prompt", lines=6),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    examples=[
        ["你好!你是谁?"],  # "Hello! Who are you?"
        ["请简单介绍一下大语言模型?"],  # "Please briefly introduce large language models."
        ["请讲一个小人物成功的故事."],  # "Tell a story about an ordinary person's success."
        ["浙江的省会在哪里?"],  # "What is the capital of Zhejiang?"
        ["写一篇100字的文章,题目是'人工智能开源的优势'"],  # "Write a 100-character essay titled 'The advantages of open-source AI'."
    ],
)
with gr.Blocks(css="style.css") as demo:
    gr.Markdown("""<center><font size=8>ipex-llm-test for multi-models👾</font></center>""")
    gr.Markdown("""<center><font size=6>Please clear the chatbot before loading a new model</font></center>""")
    with gr.Row():
        with gr.Column(scale=3):
            model_option = gr.Dropdown(choices=model_options, label="Model_ID")
        with gr.Column(scale=1):
            modify_model = gr.Button("🛠️ Load model", scale=2)
    with gr.Row():
        info = gr.Textbox(lines=1, value="No model loaded", label="Model Info")
    chat_interface.render()
    modify_model.click(fn=change_model, inputs=[model_option], outputs=[info])

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
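# MAX_INPUT_TOKEN_LENGTH can be tuned via the environment before launching, e.g.:
#   MAX_INPUT_TOKEN_LENGTH=2048 python ipex-llm-multi-test.py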