"""
Copyright 2017, University of Freiburg
Chair of Algorithms and Data Structures.
Claudius Korzen <[email protected]>
Theresa Klumpp <[email protected]>
"""
import re
import argparse
import pickle
from inverted_index import InvertedIndex # NOQA


def read_benchmark(file_name):
    """
    Read a benchmark from the given file. The expected format of the file
    is one query per line, with the ids of all documents relevant for that
    query, like: <query>TAB<id1>WHITESPACE<id2>WHITESPACE<id3> ...
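
    For example, the line "animated film<TAB>1 3 4" yields the entry
    ('animated film', {1, 3, 4}).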

    >>> benchmark = read_benchmark("example-benchmark.tsv")
    >>> sorted(benchmark.items())
    [('animated film', {1, 3, 4}), ('short film', {3, 4})]
    """
    benchmark = {}
    with open(file_name, "r", encoding="utf-8") as f:
        for line in f:
            # Split the line into the query and the groundtruth part.
            query, gt = line.strip().split('\t')
            # Split the groundtruth part into ids and store them as a set.
            benchmark[query] = {int(x) for x in gt.split(" ")}
    return benchmark


def evaluate(ii, benchmark, verbose=True):
    """
    Evaluate the given inverted index against the given benchmark as
    follows. Process each query in the benchmark with the given inverted
    index and compare the result list with the groundtruth in the
    benchmark. For each query, compute (and print, if verbose=True) the
    measures P@3, P@R and AP, as well as the mean P@3, mean P@R and mean
    AP over all queries.

    Return a dictionary with one entry for each query and one entry for the
    mean. The keys are the keywords of the query (or "mean" for the mean)
    and the values are dictionaries with an entry "precision", which
    contains a list of these measures. The dictionaries for the queries
    also contain the keys "result_ids" (where the value is a list of the
    document ids returned by the inverted index) and "relevant_ids" (where
    the value is the set of ids relevant for this query).
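
    The returned dictionary therefore has the following shape (the values
    shown are illustrative, taken from the doctests below):

        {"animated film": {"precision": [p_at_3, p_at_r, ap],
                           "result_ids": [2, 4, 1],
                           "relevant_ids": {1, 3, 4}},
         "mean": {"precision": [mp_at_3, mp_at_r, map_value]}}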

    >>> ii = InvertedIndex()
    >>> ii.read_from_file("example.tsv", b=0.75, k=1.75, verbose=False)
    >>> benchmark = read_benchmark("example-benchmark.tsv")
    >>> evaluation = evaluate(ii, benchmark, verbose=False)
    >>> [round(x, 3) for x in evaluation["mean"]["precision"]]
    [0.667, 0.833, 0.694]
    >>> evaluation["animated film"]["result_ids"]
    [2, 4, 1]
    >>> sorted(list(evaluation["animated film"]["relevant_ids"]))
    [1, 3, 4]
    >>> [round(x, 3) for x in evaluation["animated film"]["precision"]]
    [0.667, 0.667, 0.389]
    """
    evaluation = {}
    sum_p_at_3 = 0
    sum_p_at_r = 0
    sum_ap = 0
    num_queries = len(benchmark)

    for query, relevant_ids in benchmark.items():
        if verbose:
            print("Processing query '%s' ..." % query)

        # Split the query into lowercase keywords, process it with the
        # index and fetch only the document ids from the result list.
        words = [x.lower().strip() for x in re.split("[^A-Za-z]+", query)]
        result_ids = [x[0] for x in ii.process_query(words)]

        # Compute P@3.
        p_at_3 = precision_at_k(result_ids, relevant_ids, 3)
        sum_p_at_3 += p_at_3
        if verbose:
            print(" P@3: %.2f" % p_at_3)

        # Compute P@R, where R is the number of relevant documents.
        r = len(relevant_ids)
        p_at_r = precision_at_k(result_ids, relevant_ids, r)
        sum_p_at_r += p_at_r
        if verbose:
            print(" P@R: %.2f" % p_at_r)

        # Compute AP.
        ap = average_precision(result_ids, relevant_ids)
        sum_ap += ap
        if verbose:
            print(" AP: %.2f" % ap)

        evaluation[query] = {"precision": [p_at_3, p_at_r, ap],
                             "result_ids": result_ids,
                             "relevant_ids": relevant_ids}

    # Compute the means MP@3, MP@R and MAP over all queries.
    mp_at_3 = sum_p_at_3 / num_queries
    mp_at_r = sum_p_at_r / num_queries
    map_value = sum_ap / num_queries
    evaluation["mean"] = {"precision": [mp_at_3, mp_at_r, map_value]}

    if verbose:
        print("Mean results:")
        print(" MP@3: %s" % round(mp_at_3, 3))
        print(" MP@R: %s" % round(mp_at_r, 3))
        print(" MAP: %s" % round(map_value, 3))

    return evaluation


def precision_at_k(result_ids, relevant_ids, k):
    """
    Compute the measure P@k for the given list of result ids, as returned
    by the inverted index for a single query, and the given set of
    relevant document ids.

    Note that the relevant document ids are 1-based (as they reflect the
    line number in the dataset file).
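
    Note also that the denominator is always k, even when fewer than k
    results were returned, so missing results count as non-relevant (see
    the k=8 doctest below, which divides 4 hits by 8, not by 5).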

    >>> precision_at_k([5, 3, 6, 1, 2], {1, 2, 5, 6, 7, 8}, k=0)
    0
    >>> precision_at_k([5, 3, 6, 1, 2], {1, 2, 5, 6, 7, 8}, k=4)
    0.75
    >>> precision_at_k([5, 3, 6, 1, 2], {1, 2, 5, 6, 7, 8}, k=8)
    0.5
    """
    if k == 0:
        return 0

    # Count the relevant ids among the first (at most) k results.
    num_relevant_result_ids = 0
    for i in range(0, min(len(result_ids), k)):
        if result_ids[i] in relevant_ids:
            num_relevant_result_ids += 1
    return num_relevant_result_ids / k


def average_precision(result_ids, relevant_ids):
    """
    Compute the average precision (AP) for the given list of result ids,
    as returned by the inverted index for a single query, and the given
    set of relevant document ids.

    Note that the relevant document ids are 1-based (as they reflect the
    line number in the dataset file).
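
    AP is the sum of P@k over the ranks k at which a relevant document
    appears, divided by the total number of relevant documents; relevant
    documents missing from the result list thus lower the score. In the
    doctest below, the hits at ranks 1, 4 and 5 contribute precisions
    1.0, 0.5 and 0.6, and (1.0 + 0.5 + 0.6) / 4 = 0.525.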

    >>> average_precision([7, 17, 9, 42, 5], {5, 7, 12, 42})
    0.525
    """
    sum_ap = 0
    for i in range(0, len(result_ids)):
        if result_ids[i] in relevant_ids:
            # result_ids[i] is the (i+1)-th result, so use P@(i+1).
            sum_ap += precision_at_k(result_ids, relevant_ids, i + 1)
    return sum_ap / len(relevant_ids)


def main(precomputed_file, benchmark_file):
    """
    Evaluate a precomputed inverted index on a benchmark.

    Save the evaluation results as a dictionary in a pickle file (see
    "evaluate" for more information).
    """
    # Load the precomputed inverted index from the given file.
    print("Reading from file '%s'..." % precomputed_file)
    with open(precomputed_file, "rb") as f:
        index = pickle.load(f)

    # Read the benchmark.
    print("Reading benchmark from file '%s'..." % benchmark_file)
    benchmark = read_benchmark(benchmark_file)

    # Evaluate the inverted index against the benchmark.
    evaluation = evaluate(index, benchmark)

    # Derive the output file name from the benchmark file name.
    new_name = (benchmark_file.replace("input", "output")
                .replace(".tsv", "_")) + "evaluation.pkl"
    print(f"Saving evaluation data as {new_name}.")
    with open(new_name, "wb") as f:
        pickle.dump(evaluation, f)
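

# Example invocation (the pickle file name is illustrative; generate one
# with 'inverted_index.py' first):
#
#   python3 evaluate.py precomputed_index.pkl example-benchmark.tsv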
if __name__ == "__main__":
# Parse the command line arguments.
parser = argparse.ArgumentParser(description="""Evaluate an inverted index
against a benchmark. Compute the measures precision at 3, precision
at R and average precision. Save the data from the evaluation using
pickle.""")
parser.add_argument("precomputed_file", type=str, help="""Pickle file
containing a precomputed inverted index. To generate such a file,
use 'inverted_index.py'.""")
parser.add_argument("benchmark_file", type=str, help="""File containing the
benchmark. The expected format of the file is one query per line,
with the ids of all documents relevant for that query, like:
<query>TAB<id1>WHITESPACE<id2>WHITESPACE<id3> ...""")
args = parser.parse_args()
precomputed_file = args.precomputed_file
benchmark_file = args.benchmark_file
main(precomputed_file, benchmark_file)