evaluation.py
import json
import streamlit as st
from langchain_ollama import OllamaLLM
import string
import numpy as np
from parse import (
    search,
    extract_body_content,
    clean_body_content,
    split_content,
    parse
)

# Initialize the model
model = OllamaLLM(model="llama3.1")

# Load the ground_truth file (questions and answers)
with open('ground_truth.json', 'r') as f:
    ground_truth = json.load(f)
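
# Expected ground_truth.json layout (assumed from the fields accessed in the
# loop below; each entry must expose "question" and "answer" keys):
# [
#   {"question": "Is Paris the capital of France?", "answer": "Yes"},
#   ...
# ]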

# Prepare a dictionary to store model outputs and similarity scores
evaluation_results = {}
total_perplexity = 0
i = 1              # 1-based index of the current question
right_answer = 0   # number of questions whose answer matched the ground truth
n = 5              # number of questions to evaluate


def calculate_perplexity(text):
    """
    Rough perplexity proxy: every word is assigned the same probability
    1/len(words), so the value reduces to the word count of the text.
    """
    words = text.split()
    probabilities = np.full(len(words), 1 / len(words))  # Assume uniform distribution
    log_prob = np.log(probabilities)
    perplexity = np.exp(-np.mean(log_prob))
    return perplexity
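
# Worked example (follows from the uniform assumption above): for a 4-word
# text the probabilities are all 1/4, so perplexity = exp(-mean(log(1/4))) = 4.
# calculate_perplexity("the cat sat down")  # -> 4.0 (up to floating-point rounding)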


def normalize_text(text):
    # Remove punctuation and convert to lowercase
    return set(word.strip(string.punctuation).lower() for word in text.split())
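
# Example: normalize_text("The Eiffel Tower, in Paris!")
#   -> {"the", "eiffel", "tower", "in", "paris"}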

# # Function to determine if the question expects a binary answer
# def is_yes_no_question(question):
#     """
#     Check if the question is likely to expect a Yes/No answer.
#     """
#     yes_no_indicators = ["is", "are", "does", "do", "can", "could", "should", "would", "will", "was", "were", "did"]
#     return question.strip().lower().split()[0] in yes_no_indicators


def jaccard_similarity(set1, set2):
    """
    Calculate Jaccard similarity between two sets.
    """
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0
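
# Example: jaccard_similarity({"paris", "france"}, {"paris", "capital"})
#   -> 1/3 (one shared word out of three distinct words)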


def ask_binary_question(question, response):
    """
    Ask the model whether the response to a question is 'Yes' or 'No'.
    """
    evaluation_prompt = f"""
Question: {question}
Response: {response}
Based on the response, is the answer to the question 'Yes' or 'No'? Answer with only 'Yes' or 'No'.
"""
    return model.invoke(evaluation_prompt).strip()
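
# Example (illustrative only; the exact string returned depends on the llama3.1 model):
# ask_binary_question("Is the Eiffel Tower in Paris?",
#                     "The Eiffel Tower stands in Paris, France.")  # -> "Yes"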


# Loop through the questions in the ground_truth file
for entry in ground_truth:
    if i > n:
        break
    try:
        input_question = entry["question"]
        correct_answer = entry["answer"]
        print("This is the question:", input_question)

        question = input_question.replace(" ", "+")
        # content = search(f"https://www.google.com/search?q={question}")
        # Avoid scraping Google directly (use an alternative)
        url = f"https://lite.duckduckgo.com/lite?q={question}"
        content = search(url)
        body_content = extract_body_content(content)
        cleaned_content = clean_body_content(body_content)
        st.session_state.content = cleaned_content
        chunks = split_content(st.session_state.content)
        response = parse(chunks, question)

        correct_words = normalize_text(correct_answer)  # Normalize the correct answer
        generated_words = normalize_text(response)      # Normalize the generated answer

        # Use the model to determine the binary answer
        generated_binary = ask_binary_question(question, response)
        generated_response = generated_binary + " " + response
        generated_sentence = normalize_text(generated_response)

        # Check if all the words of the correct answer are in the generated answer (or vice versa)
        if correct_words.issubset(generated_sentence) or generated_sentence.issubset(correct_words):
            right_answer += 1
            score = 1
        else:
            score = 0

        # # Alternative: use a similarity score instead of subset matching
        # similarity_score = jaccard_similarity(correct_words, generated_sentence)
        # if similarity_score > 0.9:  # Check if similarity score is above 0.9
        #     right_answer += 1
        #     score = 1
        # else:
        #     score = 0

        # Save the question, generated response, correct answer, match score, and perplexity
        perplexity = calculate_perplexity(response)
        evaluation_results[question] = {
            "generated_answer": generated_response,
            "correct_answer": correct_answer,
            "Match Score": score,
            "Perplexity": perplexity
        }

        print(f"Generated Answer: {generated_response}")
        print(f"Correct Answer: {correct_answer}")
        print(f"Match Score: {score}")
        print(f"Perplexity: {perplexity}")
        print("=" * 50)

        total_perplexity += perplexity
    except Exception as e:
        print(f"Error processing entry {i}: {e}")
    i += 1
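
# Example of the subset match used above (hypothetical values): with
# correct_answer = "Yes" and generated_response = "Yes The tower is in Paris",
# normalize_text("Yes") is a subset of the generated word set, so score = 1.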

# Exactly n questions are evaluated (i runs from 1 to n), so divide by n
total_accuracy = right_answer * 100 / n
evaluation_results["Total Accuracy"] = total_accuracy
evaluation_results["Total Perplexity"] = total_perplexity

with open('evaluation_results_5.json', 'w') as f:
    json.dump(evaluation_results, f, indent=2)

print("Evaluation complete! Results saved to evaluation_results_5.json.")
print("Total Accuracy:", total_accuracy)