# preprocess_text.py
# requires the nltk and stop-words packages (pip install nltk stop-words)
import re
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
# create list of stop words
stop = get_stop_words('en')
# remove non-alphanumeric, non-space characters so stop words match tokenizer output
stop = [re.sub(r'([^\s\w]|_)+', '', x) for x in stop]
# add in custom stop words: day names, month names, and corpus-specific noise
days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
months = ['january', 'february', 'march', 'april', 'may', 'june',
          'july', 'august', 'september', 'october', 'november', 'december']
other = ['nan', 'podcast']
stop.extend(days)
stop.extend(months)
stop.extend(other)
# drop tokens that appear in the stop list
def remove_stop(text, stop):
    new_text = []
    for word in text:
        if word not in stop:
            new_text.append(word)
    return new_text
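# quick illustration (hypothetical input, not from the original script):
# 'the' is in the standard English stop list, so
#   remove_stop(['the', 'cat'], stop)  ->  ['cat']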
# create tokenizer
tokenizer = RegexpTokenizer(r'\w+')
# create stemmer
p_stemmer = PorterStemmer()
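# quick illustration (hypothetical inputs, not from the original script):
#   tokenizer.tokenize('Hello, world!')  ->  ['Hello', 'world']
#   p_stemmer.stem('running')            ->  'run'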
# reduce each token to its Porter stem
def stem_list(text, p_stemmer):
    new_list = []
    for word in text:
        new_list.append(p_stemmer.stem(word))
    return new_list
# full pipeline: strip noise, tokenize, remove stop words, stem
def preprocess_text(text):
    # remove mixed alphanumeric words (any word containing a digit)
    text = re.sub(r"""(?x)  # verbose regex
        \b      # Start of word
        (?=     # Look ahead to ensure that this word contains...
            \w* # (after any number of alphanumeric characters)
            \d  # ...at least one digit.
        )       # End of lookahead
        \w+     # Match the alphanumeric word
        \s*     # Match any following whitespace""",
        "", text)
    # remove urls by top-level domain (http and www fragments are removed later);
    # the dots are escaped so that e.g. ' become' is not mistaken for a .com url
    text = re.sub(r'\s(\S*\.(com|org|net|edu|gov)\S*)\b', '', text)
    # remove non-alphanumeric, non-space characters
    text = re.sub(r'([^\s\w]|_)+', '', text)
    # tokenize text
    text = tokenizer.tokenize(text.lower())
    # remove stop words
    text = remove_stop(text, stop)
    # stem
    text = stem_list(text, p_stemmer)
    # remove any remaining tokens containing http or www
    new_text = []
    for word in text:
        if re.search(r'http', word) or re.search(r'www', word):
            continue
        new_text.append(word)
    return new_text
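# minimal usage sketch; the sample sentence is illustrative (not from the
# original script) and assumes the stop-words 'en' list contains common
# words such as 'out' and 'on'
if __name__ == '__main__':
    sample = 'Check out https://example.com on Monday, 2 May!'
    # 'out'/'on' are standard stop words, 'monday'/'may' are custom stop
    # words, '2' is mixed alphanumeric, and the url is caught by the
    # domain filter above
    print(preprocess_text(sample))  # expected: ['check']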