Ollama_eval.py
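"""Evaluate llama3.1 (served locally via Ollama) on a ground-truth QA set.

Reads question/answer pairs from ground_truth.json, queries the model through
a LangChain prompt, scores each response by word-set containment against the
reference answer, and writes per-question results plus overall accuracy to
Llama_evaluation_results_20.json.
"""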
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate
import json
import string
template = (
    "You are tasked with answering the following question.\n"
    "question: {question}\n\n"
    "Please follow these instructions carefully:\n\n"
    "1. Answer only the question above; do not answer any other question.\n"
    "2. Provide accurate and complete answers to the best of your knowledge.\n"
    "3. Keep the answer as short as possible.\n"
)
# For llama 3.1 evaluation
model = OllamaLLM(model="llama3.1")
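# Assumes a local Ollama server with the llama3.1 model already pulled;
# swap the model name here to evaluate a different local model.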
def normalize_text(text):
    """Lowercase, strip surrounding punctuation, and return the set of words in text."""
    if isinstance(text, str):
        return set(word.strip(string.punctuation).lower() for word in text.split())
    return set()
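# e.g. normalize_text("Yes, it is.") -> {"yes", "it", "is"}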
def query_ollama(question):
    """Send a question through the prompt template and return the model's reply."""
    prompt = PromptTemplate.from_template(template)
    chain = prompt | model
    response = chain.invoke({"question": question})
    # Some chain configurations return a dict; extract the text field if so.
    if isinstance(response, dict):
        response = response.get("text", "")
    return response
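# Example (output is model-dependent):
#   query_ollama("What is the capital of France?")  # -> e.g. "Paris"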
def ask_binary_question(question, response):
    """
    Ask the model whether the response to a question amounts to 'Yes' or 'No'.
    """
    evaluation_prompt = f"""
    Question: {question}
    Response: {response}
    Based on the response, is the answer to the question 'Yes' or 'No'? Answer with only 'Yes' or 'No'.
    """
    return model.invoke(evaluation_prompt).strip()
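# Example (output is model-dependent):
#   ask_binary_question("Is Paris the capital of France?",
#                       "Paris is the capital of France.")  # -> e.g. "Yes"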
# ground_truth.json is expected to be a list of {"question": ..., "answer": ...} entries.
with open('ground_truth.json', 'r') as f:
    ground_truth = json.load(f)

evaluation_results = {}
i = 1
right_answer = 0
n = 20  # number of questions to evaluate
for entry in ground_truth:
    if i > n:
        break
    try:
        question = entry["question"]
        correct_answer = entry["answer"]
        response = query_ollama(question)
        correct_words = normalize_text(correct_answer)  # normalize the reference answer
        # Use the model to determine the binary (Yes/No) answer, then prepend
        # it to the free-form response before scoring.
        generated_binary = ask_binary_question(question, response)
        generated_response = generated_binary + " " + response
        generated_sentence = normalize_text(generated_response)
        # Count a match if either normalized word set contains the other.
        if correct_words.issubset(generated_sentence) or generated_sentence.issubset(correct_words):
            right_answer += 1
            score = 1
        else:
            score = 0
        # Save the question, generated response, correct answer, and match score.
        evaluation_results[question] = {
            "generated_answer": generated_response,
            "correct_answer": correct_answer,
            "Match Score": score
        }
        print(f"Evaluating Question: {question}")
        print(f"Generated Answer: {generated_response}")
        print(f"Correct Answer: {correct_answer}")
        print(f"Match Score: {score}")
        print("=" * 50)
    except Exception as e:
        print(f"Error processing question: {question}. Error: {e}")
    i += 1
# Accuracy over the n evaluated questions.
total_accuracy = right_answer * 100 / n
evaluation_results["Total Accuracy"] = total_accuracy

with open('Llama_evaluation_results_20.json', 'w') as f:
    json.dump(evaluation_results, f, indent=2)

print("Evaluation complete! Results saved to `Llama_evaluation_results_20.json`.")
print("Total Accuracy:", total_accuracy)