-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
147 lines (130 loc) · 4.32 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import math
import copy
from collections import Counter
import nltk
import re
def read_cacm(path):
    """Parse a CACM collection file into ``{doc_id: {section: text}}``.

    A record starts at a ``.I <number>`` marker and runs up to the next
    ``.I`` marker (or the end of the file). From each record only the
    Title (``.T``), Authors (``.A``) and Summary (``.W``) sections are kept,
    keyed by their letter; a section that is absent from a record is simply
    missing from that record's dict. The number after ``.I`` becomes the
    integer key of the record.
    """
    with open(path, 'r') as handle:
        content = handle.read()

    # One chunk per record: ".I" up to (not including) the next "\n.I".
    records = [found[0] for found in re.findall(r'(\.I(.|\n)+?(?=(\n\.I|$)))', content)]

    id_pattern = r'\.(I) (\d+)'
    title_pattern = r'\.(T)\n((.|\n)+?)(?=(\n\.|$))'
    authors_pattern = r'\.(A)\n((.|\n)+?)(?=(\n\.|$))'
    summary_pattern = r'\.(W)\n((.|\n)+?)(?=(\n\.|$))'
    section_pattern = r'{}|{}|{}|{}'.format(
        id_pattern, title_pattern, authors_pattern, summary_pattern
    )

    documents = {}
    for record in records:
        doc_id = None
        sections = {}
        for groups in re.findall(section_pattern, record):
            # Only the groups of the alternative that matched are non-empty;
            # the first two of them are the section letter and its text.
            letter, text = tuple(filter(len, groups))[:2]
            if letter == 'I':
                doc_id = int(text)
            else:
                sections[letter] = text
        documents[doc_id] = sections
    return documents
def read_cacm_query(query_path, qrels_path):
    """Read the CACM queries together with their relevance judgements.

    Args:
        query_path: path of the query file. A query starts at a
            ``.I <number>`` marker; the text of its ``.A`` and ``.W``
            sections (in order of appearance) is joined with single spaces
            to form the query string.
        qrels_path: path of the relevance file. The first two
            whitespace-separated columns of every non-blank line are read
            as ``<query id> <relevant document id>``; further columns are
            ignored.

    Returns:
        A ``(query_dict, qrels_dict)`` tuple where ``query_dict`` maps a
        query id to its text and ``qrels_dict`` maps a query id to the list
        of relevant document ids, in file order. Queries without any
        relevance judgement are dropped from ``query_dict``.

    Raises:
        ValueError: if a non-blank qrels line has fewer than two columns or
            a column that is not an integer.
    """
    # query
    with open(query_path, 'r') as f:
        data = f.read()

    # One chunk per query: ".I" up to (not including) the next "\n.I".
    chunks = [found[0] for found in re.findall(r'(\.I(.|\n)+?(?=(\n\.I|$)))', data)]

    id_pattern = r'\.(I) (\d+)'
    authors_pattern = r'\.(A)\n((.|\n)+?)(?=(\n\.|$))'
    text_pattern = r'\.(W)\n((.|\n)+?)(?=(\n\.|$))'
    section_pattern = r'{}|{}|{}'.format(id_pattern, authors_pattern, text_pattern)

    query_dict = {}
    for chunk in chunks:
        query_id = None
        sections = {}
        for groups in re.findall(section_pattern, chunk):
            # Only the groups of the alternative that matched are non-empty;
            # the first two of them are the section letter and its text.
            letter, text = tuple(filter(len, groups))[:2]
            if letter == 'I':
                query_id = int(text)
            else:
                sections[letter] = text
        query_dict[query_id] = ' '.join(sections.values())

    # qrels
    qrels_dict = {}
    with open(qrels_path, 'r') as f:
        for line in f:
            # split() with no argument copes with repeated spaces, tabs and
            # the trailing newline, and yields [] for a blank line.
            columns = line.split()
            if not columns:
                continue
            query_id, doc_id = (int(column) for column in columns[:2])
            qrels_dict.setdefault(query_id, []).append(doc_id)

    # Keep only the queries that can actually be evaluated.
    query_dict = {k: v for k, v in query_dict.items() if k in qrels_dict}
    return query_dict, qrels_dict
def preprocess_cacm(dictionary: dict):
    """Turn every document of a CACM dictionary into a term-frequency dict.

    For each document the values of its sections are joined with spaces and
    lower-cased, split into ``\\w+`` tokens, stripped of NLTK's English stop
    words and counted. The per-document entry is replaced in place by the
    resulting ``{term: count}`` dict, and the same (mutated) dictionary is
    returned.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    for doc_id, sections in dictionary.items():
        text = ' '.join(sections.values()).lower()
        kept_tokens = (
            token for token in re.findall(r'\w+', text) if token not in stop_words
        )
        # Rebinding an existing key does not resize the dict, so it is safe
        # to do while iterating over it.
        dictionary[doc_id] = dict(Counter(kept_tokens))
    return dictionary
def inverse_dict(dictionary):
    """Swap the two key levels of a nested dict.

    Given ``{outer: {inner: value}}`` returns ``{inner: {outer: value}}``.
    The input is not modified.
    """
    inverted = {}
    for outer_key, inner_values in dictionary.items():
        for inner_key, value in inner_values.items():
            inverted.setdefault(inner_key, {})[outer_key] = value
    return inverted
class TermDocumentDict:
    """Inverted index: ``term -> {document: value}``.

    Attributes:
        dict: the index itself, ``{term: {document: value}}``.
        all_documents: every known document, in insertion order (documents
            without any term included).
        N: number of known documents.
    """

    def __init__(self, dictionary: dict = None):
        """Build the index from a ``{document: {term: value}}`` mapping.

        Args:
            dictionary: per-document term values. ``None`` (the default)
                or an empty mapping builds an empty index. A document whose
                term mapping is empty is still counted in ``N`` and listed
                in ``all_documents``.
        """
        # document -> highest value stored for it (None until a term is added)
        self._max_freq_docs = {}
        self.all_documents = []
        self.dict = {}
        self.N = 0
        for document, term_values in (dictionary or {}).items():
            self._register(document)
            for term, value in term_values.items():
                self.add(term, document, value)

    def _register(self, document):
        """Record *document* as part of the collection if it is new."""
        if document not in self._max_freq_docs:
            self._max_freq_docs[document] = None
            self.all_documents.append(document)
            self.N += 1

    def documents(self, term):
        """Get all the documents that contain the term, with their value.

        Returns the index's own ``{document: value}`` dict for a known term
        and a fresh empty dict for an unknown one.
        """
        return self.dict.get(term, {})

    def terms(self, document):
        """Get all the terms that are in the document, with their value."""
        return {
            term: postings[document]
            for term, postings in self.dict.items()
            if document in postings
        }

    def add(self, term, document, value):
        """Store *value* for the ``(term, document)`` pair.

        An unseen document is added to ``all_documents`` and counted in
        ``N``, and the per-document maximum used by the weighting is kept
        up to date, so ``weight_inplace`` also works for entries created
        through this method.
        """
        self._register(document)
        postings = self.dict.setdefault(term, {})
        previous = postings.get(document)
        postings[document] = value

        highest = self._max_freq_docs[document]
        if highest is None or value >= highest:
            self._max_freq_docs[document] = value
        elif previous is not None and previous == highest:
            # The former maximum was overwritten by something smaller:
            # the cached maximum may be stale, recompute it.
            self._max_freq_docs[document] = max(self.terms(document).values())

    def __getitem__(self, key):
        """Return the value stored for ``key = (term, document)``, or 0."""
        term, document = key
        return self.dict.get(term, {}).get(document, 0)

    def weight_inplace(self):
        """Replace every stored value by its TF-IDF weight.

        For a term present in ``ni`` documents, the value ``v`` stored for a
        document becomes ``v / max_value_of_document * log10(N / ni + 1)``.
        """
        for postings in self.dict.values():
            if not postings:
                continue
            idf = math.log10(self.N / len(postings) + 1)
            for document in postings:
                postings[document] = postings[document] / self._max_freq_docs[document] * idf

    def weight(self):
        """Return a TF-IDF weighted deep copy; this instance is left untouched."""
        c = copy.deepcopy(self)
        c.weight_inplace()
        return c

    def __str__(self):
        return str(self.dict)

    def __repr__(self):
        return str(self)