-
Notifications
You must be signed in to change notification settings - Fork 34
/
token_replacement.py
84 lines (69 loc) · 1.88 KB
/
token_replacement.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
class token_replacement(object):
    """
    Replace common symbol tokens with plain-text stand-ins.

    The substitutions are applied one after another, in this order:

        '&'     -> ' and '
        '%'     -> ' percent '
        '>'     -> ' greater-than '
        '<'     -> ' less-than '
        '='     -> ' equals '
        '#'     -> ' '
        '~'     -> ' '
        '/'     -> ' '
        '\\'    -> ' '
        '|'     -> ' '
        '$'     -> ''
        ' : '   -> ' '   (colon with a space on both sides)
        '--'    -> ' '   (double dash)
        " 's "  -> ' '   (possessive split off as its own token)
        "'"     -> ''    (single quote)
        '"'     -> ''    (double quote)

    After substitution, whitespace runs inside each line are collapsed to a
    single space and lines that end up empty are dropped.
    """

    def __init__(self, remove=False):
        """
        Build the replacement table.

        Args:
            remove: If True, every token in the table is replaced by a single
                space instead of its usual stand-in.
        """
        # Order matters: __call__ walks this dict in iteration order (which is
        # insertion order on Python 3.7+), and the " 's " rule has to run
        # before the bare-quote rule strips the apostrophe it matches on.
        self.replace_dict = {
            "&": " and ",
            "%": " percent ",
            ">": " greater-than ",
            "<": " less-than ",
            "=": " equals ",
            "#": " ",
            "~": " ",
            "/": " ",
            "\\": " ",
            "|": " ",
            "$": "",
            # Remove empty colons
            " : ": " ",
            # Remove double dashes
            "--": " ",
            # Remove possessive splits
            " 's ": " ",
            # Remove quotes
            "'": "",
            '"': "",
        }

        if remove:
            # Same keys in the same order, each mapped to a single space.
            self.replace_dict = dict.fromkeys(self.replace_dict, " ")

    def __call__(self, text):
        """
        Apply the replacement table to a document and tidy its whitespace.

        Args:
            text: A string document.

        Returns:
            The document with every table entry substituted, whitespace runs
            within each line collapsed to one space, and empty lines removed.
            Line breaks between the remaining lines are kept.
        """
        for token, replacement in self.replace_dict.items():
            text = text.replace(token, replacement)

        # split() with no argument discards leading/trailing whitespace and
        # collapses internal runs; working line by line keeps the line breaks.
        lines = [" ".join(line.split()) for line in text.split("\n")]

        # Drop lines that are empty after normalization.
        return "\n".join(line for line in lines if line)