Normalizer.py
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from english_dictionary.scripts.read_pickle import get_dict
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
import string
import csv


class Normalizer():
    # The normalizer works on tokenized text:
    #  - applies lowercasing if enabled
    #  - removes all tokens that are punctuation marks
    #  - expands clitics
    #  - identifies non-standard words by looking up each token in an English dictionary
    #    and replaces them with the closest dictionary entry (by Levenshtein distance)
    def __init__(self, file_path):
        self.file_path = file_path
        self.dictionary = self.read_csv()
        self.clitics_dictionary = {
            "'ll": "will",
            "'d": "would/had",
            "'s": "is/has",
            "'re": "are",
            "'ve": "have",
            "'m": "am",
            "n't": "not"
        }

    def read_csv(self):
        with open(self.file_path, 'r') as file:
            reader = csv.reader(file)
            data_list = [row[0] for row in reader]
        return data_list

    def returnClosestWord(self, token):
        # brute-force search: compare the token against every dictionary word
        # and return the one with the smallest Levenshtein distance
        distance = 1000
        closestWord = ""
        for word in self.dictionary:
            currentDistance = self.levenshtein(token, word)
            if currentDistance < distance:
                distance = currentDistance
                closestWord = word
        return closestWord

    # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
    def levenshtein(self, word1, word2):
        if len(word1) < len(word2):
            return self.levenshtein(word2, word1)
        # len(word1) >= len(word2)
        if len(word2) == 0:
            return len(word1)
        previous_row = range(len(word2) + 1)
        for i, c1 in enumerate(word1):
            current_row = [i + 1]
            for j, c2 in enumerate(word2):
                # j + 1 instead of j because previous_row and current_row
                # are one character longer than word2
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    def normalize(self, token_list, lowercaseEnabled):
        # convert to lowercase if enabled
        if lowercaseEnabled:
            token_list = [token.lower() for token in token_list]
        # remove all punctuation tokens
        token_list = [token for token in token_list if token not in string.punctuation]
        normalized_tokens = []
        for token in token_list:
            if token in self.clitics_dictionary:
                # expand clitics, e.g. "n't" -> "not", "'ll" -> "will"
                normalized_tokens.append(self.clitics_dictionary[token])
            elif token not in self.dictionary:
                # replace misspelled words by their most similar dictionary entry
                # print(token + " is not a standard word")
                substitution = self.returnClosestWord(token)
                normalized_tokens.append(substitution)
                # print(substitution + " is the conversion")
            else:
                normalized_tokens.append(token)
        return normalized_tokens
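

# --- Example usage (illustrative sketch, not part of the original module) ---
# Assumes a one-word-per-row CSV dictionary at "dictionary.csv"; the path and the
# sample sentence below are assumptions made for demonstration only.
if __name__ == "__main__":
    normalizer = Normalizer("dictionary.csv")
    tokens = TreebankWordTokenizer().tokenize("She didn't recieve the parcel.")
    # expected behaviour: "n't" expands to "not", the final "." is dropped, and an
    # out-of-dictionary token such as "recieve" is replaced by its closest dictionary entry
    print(normalizer.normalize(tokens, lowercaseEnabled=True))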