cli_demo.py
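"""Interactive command-line demo for FinLLM credit-default prediction.

Loads the meta-llama/Llama-2-7b-chat-hf base model, attaches the
FinGPT/fingpt-forecaster_dow30_llama2-7b_lora LoRA adapter, and runs a
chat loop in which the model predicts credit-default risk from the
financial data the user types in.
"""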
import re
import os
import torch
import platform
from colorama import Fore, Style
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from transformers.generation.utils import GenerationConfig
from peft import PeftModel
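
# Llama-2 chat template delimiters for the instruction and system-prompt blocks.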
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
SYSTEM_PROMPT = "You are an experienced financial risk analyst. Your task is to predict whether the user is at risk of credit default based on the basic financial data given, as follows: " \
                "Your answer format should be as follows:\n\n[Prediction]:\nYes or No\n\n[Analysis]:\n...\n"


def construct_prompt(prompt):
    prompt = B_INST + B_SYS + SYSTEM_PROMPT + E_SYS + prompt + E_INST
    return prompt


def predict(model, tokenizer, streamer, prompt):
    # Print GPU utilization
    # print_gpu_utilization()
    prompt = construct_prompt(prompt)
    inputs = tokenizer(prompt, return_tensors='pt', padding=False)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    # print("Inputs loaded onto devices.")
    # NOTE: max_length counts prompt tokens plus generated tokens; raise it if
    # answers get cut off.
    res = model.generate(
        **inputs, max_length=256, do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True, streamer=streamer
    )
    output = tokenizer.decode(res[0], skip_special_tokens=True)
    # Keep only the text after the closing [/INST] tag, i.e. the model's answer.
    answer = re.sub(r'.*\[/INST\]\s*', '', output, flags=re.DOTALL)
    torch.cuda.empty_cache()
    return answer


def init_model():
    print("Initializing model...")
    # model_path = "Go4miii/DISC-FinLLM"
    # model_path = "/home/nist3/Fin4LLM/hf-models/llmaa-2-7b-chat-hf/"
    model_path = "meta-llama/Llama-2-7b-chat-hf"
    # Load the base Llama-2 chat model in float16, sharded across available devices.
    base_model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True,
    )
    # model.generation_config = GenerationConfig.from_pretrained(model_path)
    # Attach the FinGPT forecaster LoRA adapter on top of the base model.
    model = PeftModel.from_pretrained(base_model, 'FinGPT/fingpt-forecaster_dow30_llama2-7b_lora')
    model = model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
    streamer = TextStreamer(tokenizer)
    return model, tokenizer, streamer


def clear_screen():
    if platform.system() == "Windows":
        os.system("cls")
    else:
        os.system("clear")
    print(
        Fore.YELLOW
        + Style.BRIGHT
        + "Welcome to the Guangzhou University FinLLM. Type a message to chat, 'clear' to clear the history, "
        + "Ctrl+C to interrupt generation, 'stream' to toggle streaming output, and 'exit' to quit."
    )
    return []


def main(stream=True):
    model, tokenizer, streamer = init_model()
    messages = clear_screen()
    while True:
        prompt = input(Fore.GREEN + Style.BRIGHT + "\nUser: " + Style.NORMAL)
        if prompt.strip() == "exit":
            break
        if prompt.strip() == "clear":
            messages = clear_screen()
            continue
        print(Fore.CYAN + Style.BRIGHT + "\nFinLLM:" + Style.NORMAL, end="")
        if prompt.strip() == "stream":
            stream = not stream
            print(
                Fore.YELLOW + "(Streaming output {})\n".format("enabled" if stream else "disabled"),
                end="",
            )
            continue
        messages.append({"role": "user", "content": prompt})
        if stream:
            # predict() already echoes tokens to stdout through the TextStreamer,
            # so just wait for the complete answer and keep it for the history.
            response = ""
            try:
                response = predict(model, tokenizer, streamer, prompt)
                if torch.backends.mps.is_available():
                    torch.mps.empty_cache()
            except KeyboardInterrupt:
                pass
            print()
        else:
            response = predict(model, tokenizer, streamer, prompt)
            print(response)
            if torch.backends.mps.is_available():
                torch.mps.empty_cache()
        messages.append({"role": "assistant", "content": response})
        print(Style.RESET_ALL)


if __name__ == "__main__":
    main()