-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathurl_replacement.py
46 lines (34 loc) · 1.14 KB
/
url_replacement.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from . import nlp
class url_replacement(object):
    """
    Strips URLs and email addresses out of a document, or swaps them
    for caller-supplied placeholder strings.

    Tokens are classified through the ``like_url`` / ``like_email``
    flags of the tokens produced by the package-level ``nlp`` callable.
    """

    def __init__(self, email_replacement="", url_replacement=""):
        """
        Store the placeholder strings.

        Args:
            email_replacement: Text emitted in place of each email-like
                token; an empty string drops the token.
            url_replacement: Text emitted in place of each URL-like
                token; an empty string drops the token.
        """
        self.url_replacement = url_replacement
        self.email_replacement = email_replacement

    def __call__(self, text):
        """
        Apply the replacement to one document.

        Args:
            text: A string document
        Returns:
            The document with runs of whitespace collapsed to single
            spaces and every URL-like / email-like token removed or
            replaced.
        """
        # Collapse all whitespace runs first so tokens are separated by
        # at most a single space.
        words = text.strip().split()
        normalized = " ".join(words)

        pieces = []
        for token in nlp(normalized):
            if token.like_url:
                substitute = self.url_replacement
            elif token.like_email:
                substitute = self.email_replacement
            else:
                # Ordinary token: keep it verbatim, trailing space included.
                pieces.append(token.text_with_ws)
                continue

            # The trailing whitespace is emitted only together with a
            # placeholder, so a dropped token does not leave a doubled
            # space behind.
            if substitute:
                pieces.append(substitute)
                pieces.append(token.whitespace_)

        return "".join(pieces)