-
Notifications
You must be signed in to change notification settings - Fork 34
/
replace_acronyms.py
294 lines (236 loc) · 9.58 KB
/
replace_acronyms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import nlpre.identify_parenthetical_phrases as IPP
import collections
import logging
from . import nlp
class replace_acronyms(object):
    """
    Replaces acronyms and abbreviations found in a document with their
    corresponding phrase. A counter dictionary is passed in __init__
    to determine phrases.

    If an acronym is explicitly identified with a phrase in a document, then
    all instances of that acronym in the document will be replaced with the
    given phrase.

    Example:
        input: "The Environmental Protection Agency (EPA) protects trees. The
                EPA was created by Nixon"
        output: "The Environmental Protection Agency
                 (Environmental_Protection_Agency) protects trees. The
                 Environmental_Protection_Agency was created by Nixon"

    If there is no explicit indication what the phrase is within the document,
    then the most common phrase associated with the acronym in the given
    counter is used (only when ``use_most_common`` is set).

    Example:
        input: "The EPA protects trees"
        output: "The Environmental_Protection_Agency protects trees"

    Counters handled by this class are mappings whose keys are
    ``(phrase_words, acronym)`` pairs and whose values are counts.
    """

    def __init__(
        self,
        counter=None,
        prefix=None,
        suffix=None,
        underscore=True,
        preprocessed=False,
        use_most_common=False,
    ):
        """
        Initialize the parser, the acronym dictionary, and flags

        Args:
            counter: A counter object of acronyms and their counts found in a
                     larger corpus. May be None, in which case only acronyms
                     registered later (per document) are known.
            prefix: An optional string inserted as an extra leading token of
                    every underscored phrase
            suffix: An optional string appended as an extra trailing token of
                    every underscored phrase
            underscore: A boolean to indicate whether to insert phrases
                        as a single underscored token
            preprocessed: A boolean to indicate if input text is raw,
                          or has been processed by other NLPre modules
            use_most_common: A boolean to indicate if the phrase is replaced
                             by the most common one, regardless if it is found
                             within the current document
        """
        self.logger = logging.getLogger(__name__)

        self.prefix = prefix
        self.suffix = suffix
        self.underscore = underscore
        self.preprocessed = preprocessed
        self.use_most_common = use_most_common

        self.IPP = IPP.identify_parenthetical_phrases()

        # Maps acronym -> Counter of {phrase_words: count}
        self.acronym_dict = {}
        self.counter = counter
        self._build_acronym_dict(counter)

    def _build_acronym_dict(self, counter):
        """
        Merge a counter of (phrase, acronym) pairs into self.acronym_dict

        Counts for a (phrase, acronym) pair that is already registered are
        overwritten, not summed.

        Args:
            counter: a mapping of (phrase_words, acronym) -> count, or None
        """
        if counter is None:
            return

        for acronym_tuple, count in counter.items():
            phrase_words, acronym = acronym_tuple[0], acronym_tuple[1]
            phrase_counts = self.acronym_dict.setdefault(
                acronym, collections.Counter()
            )
            phrase_counts[phrase_words] = count

    def check_self_counter(self, token, doc_counter):
        """
        Look up the phrase a document itself defines for an acronym token

        The first matching entry of doc_counter (in iteration order) wins.

        Args:
            token: a string token
            doc_counter: a counter object of acronyms defined within a document

        Returns:
            The phrase as a list of string tokens, or a boolean False if the
            document does not define the acronym
        """
        for acronym_tuple in doc_counter:
            if acronym_tuple[1] == token:
                return list(acronym_tuple[0])
        return False

    def word_belongs(self, word, acronym_phrases):
        """
        Return false if a word cannot belong to any abbreviated phrase.

        Since words with dashes are turned into a list of their
        individual words, we must treat all words as a list. So, single
        tokens are converted to single item lists.

        Args:
            word: a string token, or a list of string tokens
            acronym_phrases: a 2D list of string tokens

        Returns:
            a boolean, True when at least one phrase contains every piece
            of the word
        """
        wordlist = [word] if isinstance(word, str) else list(word)

        # Membership must be tested against each individual phrase; testing
        # against the 2D list itself compares strings to lists and can
        # never match.
        return any(
            all(piece in acronym_phrase for piece in wordlist)
            for acronym_phrase in acronym_phrases
        )

    def check_acronym(self, token):
        """
        Check if a token is an acronym to be replaced

        Tokens that are unchanged by lowercasing are never treated as
        acronyms.

        Args:
            token: a string token

        Returns:
            a boolean
        """
        if token.lower() == token:
            return False
        return token in self.acronym_dict

    def check_phrase(self, token, pos, tokens, counter):
        """
        Determine if a series of tokens, starting with the input token, form
        an acronym phrase found in the counter. If the token does not form
        a phrase, the function returns as false.

        Args:
            token: a string token, the token found at tokens[pos]
            pos: an int, the index of token within tokens
            tokens: a list of string tokens
            counter: a counter of acronyms and associated phrases

        Returns:
            output: a tuple of (the tokens that make up an acronym phrase
                    connected by underscores, the number of tokens consumed
                    after the starting token).
                    If no valid output exists, or underscoring is disabled,
                    a boolean False is returned
        """
        if not self.underscore:
            return False

        acronym_phrases = [list(acronym_tuple[0]) for acronym_tuple in counter]

        phrase = []
        word = token
        extra_tokens = 0

        while True:
            # Hyphenated tokens contribute each of their pieces to the phrase
            pieces = word.split("-") if "-" in word else [word]
            phrase.extend(pieces)

            # Once a word fits no phrase, a longer candidate cannot match
            if not self.word_belongs(pieces, acronym_phrases):
                return False

            if phrase in acronym_phrases:
                return ("_".join(phrase), extra_tokens)

            # Advance only after testing the current token, so that a
            # phrase ending on the very last token is still recognized.
            pos += 1
            if pos >= len(tokens):
                return False
            word = tokens[pos]
            extra_tokens += 1

    def _expand_acronym(self, token, doc_counter):
        """
        Build the replacement tokens for a token accepted by check_acronym

        Args:
            token: a string token present in self.acronym_dict
            doc_counter: a counter object of acronyms defined within a document

        Returns:
            a list of string tokens to emit in place of the acronym
        """
        highest_phrase = self.check_self_counter(token, doc_counter)

        if not highest_phrase:
            # Not defined in this document: keep the token unless told
            # to fall back on the most common known phrase.
            if not self.use_most_common:
                return [token]
            acronym_counts = self.acronym_dict[token]
            highest_phrase = list(acronym_counts.most_common(1)[0][0])

        if not self.underscore:
            self.logger.info(
                "Replacing token %s with phrase %s", token, highest_phrase
            )
            return highest_phrase

        # Prefix and suffix are only applied to underscored phrases
        if self.prefix:
            highest_phrase.insert(0, self.prefix)
        if self.suffix:
            highest_phrase.append(self.suffix)

        joined_phrase = "_".join(highest_phrase)
        self.logger.info(
            "Replacing token %s with phrase %s", token, joined_phrase
        )
        return [joined_phrase]

    def __call__(self, document, doc_counter=None):
        """
        Identify and replace all acronyms in the document

        Args:
            document: a string
            doc_counter: a counter object of acronyms defined within a
                         document. If missing, identify_parenthetical_phrases
                         is run.

        Returns:
            new_doc: a string, one output line per input sentence
        """
        if doc_counter is None:
            doc_counter = self.IPP(document)

        # NOTE(review): this merges the document's definitions into the
        # instance-wide dictionary, so they persist for later calls and
        # overwrite counts of identical (phrase, acronym) pairs. Confirm
        # this accumulation is intended.
        self._build_acronym_dict(doc_counter)

        if self.preprocessed:
            parsed = document.split("\n")
        else:
            document = " ".join(document.strip().split())
            parsed = list(nlp(document, disable=["tagger"]).sents)

        new_doc = []

        for sentence in parsed:
            if self.preprocessed:
                tokens = sentence.split()
            else:
                tokens = [x.text for x in sentence]

            new_sentence = []
            index = 0

            while index < len(tokens):
                token = tokens[index]

                if self.check_acronym(token):
                    new_sentence.extend(
                        self._expand_acronym(token, doc_counter)
                    )
                    index += 1
                    continue

                # Note: this code is case sensitive. Tokens will not be
                # recognized as being acronym phrases if their capitalization
                # is different from the phrases found in
                # identify_parenthetical_phrases.py
                #
                # This is particularly an issue with titlecaps.py, which might
                # force phrase tokens to lowercase
                tokenized_phrase = self.check_phrase(
                    token, index, tokens, doc_counter
                )

                if tokenized_phrase:
                    phrase, extra_tokens = tokenized_phrase
                    if self.prefix:
                        phrase = "_".join([self.prefix, phrase])
                    if self.suffix:
                        phrase = "_".join([phrase, self.suffix])
                    self.logger.info("Tokenizing phrase %s", phrase)
                    new_sentence.append(phrase)
                    # Skip the tokens that were folded into the phrase
                    index += extra_tokens
                else:
                    new_sentence.append(token)

                index += 1

            new_doc.append(" ".join(new_sentence))

        return "\n".join(new_doc)