-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnormalization.py
58 lines (51 loc) · 1.55 KB
/
normalization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re
import unicodedata
# Lookup table from archaic or special letterforms to plain replacement
# strings. Insertion order is kept exactly as originally written in case a
# consumer applies the entries sequentially.
#
# NOTE(review): most entries fold an old letterform into a modern spelling,
# but "j" -> "i", "v" -> "u" and "th" -> "y" point the other way. Confirm the
# intended direction with whoever consumes this table; normalize_text() below
# does not read it.
replacement_map = {
    # Ligatures
    "Æ": "ae",
    "æ": "ae",
    # Long s and r rotunda
    "ſ": "s",
    "ꝛ": "r",
    # Letter pairs that early printing did not distinguish
    "j": "i",
    "v": "u",
    "th": "y",
    # Thorn and eth
    "Þ": "th",
    "þ": "th",
    "Ð": "d",
    "ð": "d",
    # Wynn
    "ƿ": "w",
    "Ƿ": "w",
    # Letterforms folded to "d"
    "ꝣ": "d",
    "Ꝺ": "d",
    # Yogh
    "Ȝ": "y",
    "ȝ": "y",
    # Sharp s
    "ß": "ss",
    # oe ligature
    "Œ": "oe",
    "œ": "oe",
    # Stroked letters, presumably scribal abbreviation forms, reduced to
    # their base letter
    "Ꝋ": "o",
    "ꝋ": "o",
    "Ꝑ": "p",
    "ꝑ": "p",
    "Ꝙ": "q",
    "ꝙ": "q",
    # Lowercase counterpart of the "Ꝺ" entry above
    "ꝺ": "d",
}
# Single-character folds applied after lowercasing. The long s needs no entry
# here: NFKD normalization has already rewritten it to a plain "s".
_CHAR_FOLDS = str.maketrans({"j": "i", "v": "u", "æ": "ae"})

# "ye" is only rewritten when it stands alone as a word. A plain substring
# replace would corrupt ordinary words ("eyes" -> "ethes", "yet" -> "thet").
_YE_WORD = re.compile(r"\bye\b")

# Ordered (old, new) spelling rewrites. They run AFTER the character folds,
# so every pattern must be spelled the way the text looks at that point
# (i.e. with "v" already turned into "u").
# NOTE(review): these are substring replacements, presumably so inflected
# forms such as "bookes" are covered too; that also means e.g. "worker"
# becomes "workr". Left as-is.
_SPELLING_RULES = (
    ("tragedie", "tragedy"),
    ("comedie", "comedy"),
    ("historie", "history"),
    # Was 'haviour' -> 'havior', which could never match once "v" had been
    # folded to "u". Matching the folded spelling makes "behaviour" and
    # "behavior" normalize to the same string.
    ("hauiour", "hauior"),
    ("honour", "honor"),
    ("labour", "labor"),
    ("griefe", "grief"),
    ("loue", "love"),
    ("publike", "public"),
    ("musicke", "music"),
    ("magicke", "magic"),
    ("worke", "work"),
    ("booke", "book"),
)


def normalize_text(text):
    """Return a lowercase, accent-free, spelling-normalized copy of *text*.

    Steps, in order:

    1. NFKD-normalize and drop combining marks (removes diacritics and
       expands compatibility characters such as the long s).
    2. Lowercase.
    3. Fold ``j`` to ``i``, ``v`` to ``u`` and ``æ`` to ``ae``.
    4. Replace the standalone word ``ye`` with ``the``.
    5. Apply the fixed list of early-modern spelling rewrites.

    Args:
        text: The string to normalize. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The normalized string, or *text* itself when it is falsy.
    """
    if not text:
        return text

    decomposed = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower().translate(_CHAR_FOLDS)
    text = _YE_WORD.sub("the", text)

    for old, new in _SPELLING_RULES:
        text = text.replace(old, new)
    return text