-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluation.py
118 lines (96 loc) · 5.04 KB
/
evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- coding: utf-8 -*-
from tqdm import tqdm
from scripts import io
from scripts.configs import BaseConfig
from scripts.dataset import format_context, build_gpt4_user_prompt, build_gpt4_user_prompt_eval2
from scripts.utils import call_gpt4_api, paper_structure_score, word_count_score
# Number of numbered result keys (suffixes 1..FOLD_NO) that
# check_gpt4_eval_key requires for a given prefix.
FOLD_NO = 3


def check_gpt4_eval_key(prefix, keys, *, fold_no=None):
    """Find the first numbered key that is still missing from ``keys``.

    The required keys are ``prefix + "1"`` through ``prefix + str(fold_no)``,
    checked in ascending order.

    Args:
        prefix: String prepended to each 1-based index to form a required key.
        keys: Iterable of keys that are already present.
        fold_no: How many numbered keys are required. Defaults to the
            module-level ``FOLD_NO`` (looked up at call time).

    Returns:
        ``(True, missing_key)`` for the first required key not found in
        ``keys``, or ``(False, "")`` when every required key is present.
    """
    if fold_no is None:
        fold_no = FOLD_NO

    # Materialise once so one-shot iterables (generators, iterators) can be
    # checked against every required key. A list keeps the original
    # equality-based comparison instead of requiring hashable elements.
    present = list(keys)

    for fold in range(1, fold_no + 1):
        req_key = prefix + str(fold)
        if req_key not in present:
            return True, req_key
    return False, ""
if __name__ == '__main__':
    # Entry point: for every inference file, attach evaluation results to each
    # item and write the updated JSON back to the same path it was read from.
    args = BaseConfig().get_args()

    inf_paths = [
        "assets/gpt4-vanilla.json",
        "assets/vanila.json",
        args.llm_warmup_inf,
        args.rlhf_style_warmup_inf,
        args.rlhf_style_inf,
        args.rlhf_style_gpt4_warmup_inf,
        args.rlhf_style_gpt4_inf,
    ]

    # One vocabulary entry per line, lower-cased.
    reward_vocab = [vocab.lower() for vocab in io.read_text(args.reward_vocab).split("\n")]

    # The test set does not depend on the inference file being processed, so
    # it is read once here instead of once per file.
    df = io.read_csv(args.orkg_synthesis_test)

    for inf_path in inf_paths:
        finetuned_inf = io.read_json(inf_path)
        print("size of the dataset is: ", len(finetuned_inf))

        for idx, item in enumerate(tqdm(finetuned_inf)):
            synthesis = item['inference']
            if "llama" in inf_path:
                # Strip the role-marker residue from outputs of files whose
                # path contains "llama".
                synthesis = synthesis.replace("userassistant", " ").replace("useruser", " ")

            # GPT-4 evaluation, stage 1.
            # NOTE: intentionally disabled via `while False`. When enabled
            # (`while True`), the loop requests one evaluation per missing
            # "gpt-4-eval-s1-<n>" key and exits once none is missing.
            while False:
                do_eval, eval_key = check_gpt4_eval_key(prefix="gpt-4-eval-s1-", keys=list(item))
                print(f"Eval-key: {eval_key}, for item {idx}")
                if not do_eval:
                    break
                row = df[df['sample_id'] == item['sample_id']]
                research_problem = row['research_problem'].tolist()[0]
                synthesis_type = item['synthesis']
                context = format_context(row)
                user_prompt = build_gpt4_user_prompt(
                    synthesis=synthesis,
                    research_problem=research_problem,
                    synthesis_type=synthesis_type,
                    context=context,
                )
                message = [
                    {"role": "system", "content": args.eval1_system_prompt_problem},
                    {"role": "user", "content": user_prompt},
                ]
                response, completion_tokens, prompt_tokens = call_gpt4_api(message)
                item[eval_key] = {
                    "eval-result": response,
                    "completion-tokens": completion_tokens,
                    "prompt-tokens": prompt_tokens
                }
                # Persist right after each API call.
                io.write_json(output_path=inf_path, json_data=finetuned_inf)

            # GPT-4 evaluation, stage 2: one API call per characteristic.
            # NOTE: intentionally disabled via `while False`, same as stage 1.
            while False:
                do_eval, eval_key = check_gpt4_eval_key(prefix="gpt-4-eval-s2-", keys=list(item))
                print(f"Eval-key: {eval_key}, for item {idx}")
                if not do_eval:
                    break
                row = df[df['sample_id'] == item['sample_id']]
                research_problem = row['research_problem'].tolist()[0]
                synthesis_type = item['synthesis']
                context = format_context(row)
                user_prompt = build_gpt4_user_prompt_eval2(
                    synthesis=synthesis,
                    research_problem=research_problem,
                    synthesis_type=synthesis_type,
                    context=context,
                )
                responses = {}
                completion_tokens = 0
                prompt_tokens = 0
                for characteristic_dict in args.eval2_characteristic_lst:
                    # Fill the system-prompt template: every key of the
                    # characteristic dict is replaced by its value.
                    system_prompt = args.eval2_system_prompt_problem
                    for key, val in characteristic_dict.items():
                        system_prompt = system_prompt.replace(key, val)
                    message = [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt},
                    ]
                    response, completion_token, prompt_token = call_gpt4_api(message, max_tokens=150)
                    # Keep only the value stored under the response's first key.
                    responses[characteristic_dict['<characteristic-title>']] = response[list(response.keys())[0]]
                    completion_tokens += completion_token
                    prompt_tokens += prompt_token
                item[eval_key] = {
                    "eval-result": responses,
                    "completion-tokens": completion_tokens,
                    "prompt-tokens": prompt_tokens
                }
                io.write_json(output_path=inf_path, json_data=finetuned_inf)

            # Basic scores are computed once per item and skipped on re-runs
            # when the "basic-eval" entry already exists.
            if "basic-eval" not in item:
                item['basic-eval'] = {
                    "paper-structure": paper_structure_score(synthesis, reward_vocab),
                    "word-count": word_count_score(synthesis)
                }
                io.write_json(output_path=inf_path, json_data=finetuned_inf)