-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprecision.py
140 lines (126 loc) · 5.45 KB
/
precision.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import argparse
import pandas as pd
import numpy as np
from typing import List, Tuple
def read_file(tsv_file: str) -> Tuple[List, pd.DataFrame]:
    """
    Load the cosine-similarity pairs TSV and collect its reference PMIDs.

    The first line of the file is treated as a header row and its labels
    are replaced by the fixed column names used throughout this script.

    Parameters
    ----------
    tsv_file : str
        Path to the tab-separated, 4-column cosine similarity pairs file.

    Returns
    -------
    ref_pmids : array-like
        Distinct values of the ``PMID1`` column, in order of first appearance.
    data : pd.DataFrame
        The full table with columns ``PMID1``, ``PMID2``, ``Relevance`` and
        ``Cosine Similarity``.
    """
    pairs = pd.read_csv(
        tsv_file,
        sep='\t',
        header=0,
        names=["PMID1", "PMID2", "Relevance", "Cosine Similarity"],
    )
    return pairs["PMID1"].unique(), pairs
def sort_collection(pmid: str, data: pd.DataFrame) -> pd.DataFrame:
    """
    Extract the rows of one reference PMID, ranked by cosine similarity.

    Parameters
    ----------
    pmid : str
        Reference PMID; only rows whose ``PMID1`` equals this value are kept.
    data : pd.DataFrame
        Table with columns ``PMID1``, ``PMID2``, ``Relevance`` and
        ``Cosine Similarity``.

    Returns
    -------
    sorted_collection : pd.DataFrame
        The matching rows ordered by ``Cosine Similarity`` from highest to
        lowest, with the index renumbered from 0.
    """
    rows_for_pmid = data.loc[data['PMID1'] == pmid]
    # 'PMID1' is constant within the selection; it stays in the sort keys so
    # the ordering of rows with equal similarity is handled exactly as before.
    return rows_for_pmid.sort_values(
        by=['PMID1', "Cosine Similarity"],
        ascending=[True, False],
        ignore_index=True,
    )
def calculate_precision(sorted_collection: pd.DataFrame, n: int, multi_class: bool) -> float:
    """
    Compute Precision@n over the first ``n`` rows of a ranked collection.

    Parameters
    ----------
    sorted_collection : pd.DataFrame
        Ranked rows for one reference PMID; must contain a ``Relevance``
        column.
    n : int
        Cut-off rank. The count of hits is always divided by ``n``, even
        when the collection holds fewer than ``n`` rows.
    multi_class : bool
        If true (3-class scheme) only ``Relevance == 2`` counts as a hit;
        otherwise (2-class scheme) ``Relevance`` of 2 or 1 counts.

    Returns
    -------
    precision_n : float
        Precision@n rounded to 4 decimal places.
    """
    top_relevance = sorted_collection[:n]["Relevance"]
    is_hit = top_relevance == 2
    if not multi_class:
        # 2-class scheme: partially relevant rows (1) are hits as well.
        is_hit = is_hit | (top_relevance == 1)
    hit_count = int(is_hit.sum())
    return round(hit_count / n, 4)
def generate_matrix(ref_pmids: list, data: pd.DataFrame, multi_class: bool) -> np.array:
    """
    Build the Precision@n matrix for every reference PMID.

    Parameters
    ----------
    ref_pmids : list
        Reference PMIDs; one matrix row is produced per entry, in this order.
    data : pd.DataFrame
        Table with columns ``PMID1``, ``PMID2``, ``Relevance`` and
        ``Cosine Similarity``.
    multi_class : bool
        Forwarded to ``calculate_precision`` to select the 3-class or
        2-class scoring scheme.

    Returns
    -------
    precision_matrix : np.array
        Array of shape ``(len(ref_pmids), 6)`` whose columns hold
        Precision@5, @10, @15, @20, @25 and @50.
    """
    cutoffs = (5, 10, 15, 20, 25, 50)
    scores = np.empty(shape=(len(ref_pmids), len(cutoffs)))
    for row, pmid in enumerate(ref_pmids):
        # Rank once per PMID and reuse the ranking for every cut-off.
        ranked = sort_collection(pmid, data)
        scores[row, :] = [
            calculate_precision(ranked, cutoff, multi_class)
            for cutoff in cutoffs
        ]
    return scores
def write_to_tsv(ref_pmids: list, precision_matrix: np.array, output_filepath: str):
    """
    Save the precision matrix, plus a row of column averages, as a TSV file.

    The table gets a leading ``PMIDs`` column and a final row labelled
    ``Average`` holding the mean of each Precision@n column rounded to
    4 decimal places. The DataFrame index is written as the first column.

    Parameters
    ----------
    ref_pmids : list
        Reference PMIDs, one per row of ``precision_matrix``.
    precision_matrix : np.array
        Precision scores with six columns (P@5, P@10, P@15, P@20, P@25, P@50).
    output_filepath : str
        Destination path of the tab-separated output file.
    """
    score_columns = ['P@5', 'P@10', 'P@15', 'P@20', 'P@25', 'P@50']
    table = pd.DataFrame(precision_matrix, columns=score_columns)
    table.insert(0, 'PMIDs', ref_pmids)

    # Append the per-column means as one extra row below the per-PMID rows.
    column_means = table[score_columns].mean(axis=0).round(4)
    table.loc[len(table.index)] = ['Average', *column_means]

    table.to_csv(output_filepath, sep="\t")
if __name__ == "__main__":
    # Command-line entry point: read the pairs file, score it, write the TSV.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-i", "--input_path", required=True,
        help="File path to the 4-column cosine similarity RELISH TSV file.")
    cli.add_argument(
        "-o", "--output_path", required=True,
        help="File path to save the precision matrix")
    cli.add_argument(
        "-m", "--multiple_classes", required=True,
        help="If 1, apply the 3-class approach, if 0 apply the 2-class approach of considering partially-relevant articles to be positive.")
    args = cli.parse_args()

    # Only the literal string "0" selects the 2-class scheme; any other
    # value keeps the 3-class scheme.
    multi_class = args.multiple_classes != "0"

    ref_pmids, data = read_file(args.input_path)
    matrix = generate_matrix(ref_pmids, data, multi_class)
    write_to_tsv(ref_pmids, matrix, args.output_path)