"""
TODOs:
1. Test different parameters
2. Add corrections based on literature
3. Automatically determine the best parameters to use
4. Work around lines with special characters
"""
import re
from collections import Counter

from spellchecker import SpellChecker


def find_special_char(word):
    """
    Check whether a word contains any special characters.

    A character counts as special if it is not a digit, an ASCII letter,
    a hyphen, or an accented Latin character (À-ÿ).

    Parameters:
        word (str): The word to be checked.

    Returns:
        bool: True if the word contains a special character, False otherwise.
    """
    regexp = re.compile('[^0-9a-zA-Z-À-ÿ]+', re.UNICODE)
    return bool(regexp.search(word))
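
# Illustrative examples (assumed, not from the original file): digits, ASCII
# letters, hyphens and accented Latin characters are not treated as special, so
#   find_special_char("año")    -> False
#   find_special_char("co-op")  -> False
#   find_special_char("¿hola?") -> True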


def correct_line(line, dictionary):
    """
    Correct the spelling mistakes in a string `line`.

    The function splits the string into words and iterates over them. Words
    containing special characters are skipped. For every word that does not
    appear in the corpus at all, a correction is looked up with the spell
    checker. If the original word is capitalized (and not an acronym), the
    corrected word is capitalized as well; if the original word is uppercase,
    the corrected word is uppercased. When no correction is found, the word is
    passed to the Viterbi segmenter as a fallback. The corrected word then
    replaces the original word in the line.

    Parameters:
        line (str): The input string to be corrected.
        dictionary (str): Path to a plain-text corpus used to extend the
            spell checker's word frequencies.

    Returns:
        str: The corrected string.
    """
    spell = SpellChecker(language='es')
    spell.word_frequency.load_text_file(str(dictionary))
    for val in line.split():
        # Skip the word if it contains special characters
        if find_special_char(val):
            continue
        # Only touch words that were not found at all in the corpus
        # (usage frequency 0.0)
        if spell.word_usage_frequency(val) == 0.0:
            word = spell.correction(val)
            # Capitalize the corrected word if the original word is
            # capitalized (but not an acronym)
            if val[0].isupper() and not val.isupper() and word is not None:
                word = word.capitalize()
            # Uppercase the corrected word if the original word is uppercase
            elif val.isupper() and word is not None:
                word = word.upper()
            # If no correction was found, fall back to Viterbi segmentation
            if word is None:
                test = call_viterbi_segment(val, "combined_big_text.txt", 0.0)
                if test is not None:
                    line = line.replace(val, str(test))
                continue
            # Replace the original word with the corrected word
            line = line.replace(val, word)
    return line
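
# Minimal usage sketch (illustrative; "my_corpus.txt" is a hypothetical UTF-8
# plain-text corpus used to extend the Spanish dictionary):
#
#   fixed = correct_line("Ejemplo con herrores de ortografia", "my_corpus.txt")
#
# A new SpellChecker is built and the corpus re-loaded on every call, so
# correcting many lines against the same corpus is comparatively slow.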


def unite_sign(line):
    """
    Join a word containing a hyphen preceded by a letter with the word that
    follows it, removing the hyphen, unless the character right after the
    hyphen is numeric (e.g. "example-5" is kept as-is).

    Args:
        line (str): The input line to be processed.

    Returns:
        str: The processed line.

    Example:
        >>> unite_sign("This is an example-5 test.")
        'This is an example-5 test.'
        >>> unite_sign("This is a test-case")
        'This is a testcase'
    """
    # List of processed words to be joined back into a line
    processed_words = []
    jump = False
    for idx, val in enumerate(line.split()):
        # Skip this word if it was already merged into the previous one
        if jump:
            jump = False
            continue
        if "-" not in val:
            # Words without a hyphen are kept unchanged
            processed_words.append(val)
            continue
        hyphen_pos = val.find("-")
        if hyphen_pos > 0 and val[hyphen_pos - 1].isalpha():
            try:
                # Keep the word intact if the hyphen is followed by a digit
                if val[hyphen_pos + 1].isnumeric():
                    processed_words.append(val)
                    continue
            except IndexError:
                pass
            val = val.replace("-", "")
            try:
                # Get the next word from the line
                to_add = line.split()[idx + 1]
            except IndexError:
                # No next word: keep the de-hyphenated word as-is
                processed_words.append(val)
                continue
            # Concatenate the current word and the next word
            processed_words.append(val + to_add)
            jump = True
            continue
        # Hyphen at the start of the word or not preceded by a letter:
        # keep the word unchanged
        processed_words.append(val)
    # Join the processed words back into a single string
    return " ".join(processed_words)
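
# Illustrative example of the hyphenation case handled above (sample text made
# up for documentation): a token whose hyphen is preceded by a letter and not
# followed by a digit is glued to the next token, e.g.
#   unite_sign("hyphen- ated word")  ->  'hyphenated word'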


def call_viterbi_segment(line, dictionary, conf):
    """
    Segment a given text into words using the Viterbi algorithm.

    Args:
        line (str): The text to be segmented.
        dictionary (str): Path to a plain-text corpus used to build the word
            frequencies.
        conf (float): Confidence level used as a reference; segmentations
            whose probability does not exceed it are rejected.

    Returns:
        str or None: The segmented text, or None if no acceptable segmentation
        was found.
    """
    def viterbi_segment(text):
        """
        Use the Viterbi algorithm to segment `text` into words.

        Args:
            text (str): The text to be segmented.

        Returns:
            tuple (list, float) or None: The segmented words and the
            probability of the segmentation, or None if that probability does
            not exceed the reference confidence `conf`.
        """
        probs, segmentation_points = [1.0], [0]
        for i in range(1, len(text) + 1):
            prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                            for j in range(max(0, i - max_word_length), i))
            probs.append(prob_k)
            segmentation_points.append(k)
        # Walk the segmentation points backwards to recover the words
        words_out = []
        i = len(text)
        while 0 < i:
            words_out.append(text[segmentation_points[i]:i])
            i = segmentation_points[i]
        words_out.reverse()
        # Reject segmentations at or below the reference confidence level
        if probs[-1] <= float(conf):
            return None
        return words_out, probs[-1]

    def word_prob(word):
        return dictionary_counts[word] / total

    def words(text):
        # Note: only ASCII letters are kept when building the corpus counts
        return re.findall('[a-z]+', text.lower())

    # Build word frequencies from the corpus file
    with open(str(dictionary), encoding="utf-8") as corpus_file:
        dictionary_counts = Counter(words(corpus_file.read()))
    max_word_length = max(map(len, dictionary_counts))
    total = float(sum(dictionary_counts.values()))
    output = viterbi_segment(line)
    if output is None:
        return None
    return " ".join(output[0])
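

if __name__ == "__main__":
    # Small demo, kept out of import time on purpose. It assumes the corpus
    # file "combined_big_text.txt" referenced elsewhere in this module is
    # present in the working directory; the sample strings are made up.
    print(unite_sign("la palabra corta- da se une"))
    print(call_viterbi_segment("estaesunaprueba", "combined_big_text.txt", 0.0))
    print(correct_line("Ejenplo con herrores", "combined_big_text.txt"))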