Merge pull request #15 from mammothb/dev
v6.3.3 implemented word_segmentation
mammothb authored Dec 5, 2018
2 parents f157013 + 20bbb13 commit baeeb47
Showing 6 changed files with 225 additions and 19 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,10 @@
CHANGELOG <br>
==============

## 6.3.3 (2018-12-05)
---------------------
- Added `word_segmentation()` feature

## 6.3.2 (2018-10-23)
---------------------
- Added `encoding` option to `load_dictionary()`
48 changes: 47 additions & 1 deletion README.md
@@ -21,7 +21,7 @@ project_dir
\-project.py
```

### Sample usage
### Sample usage (`lookup` and `lookup_compound`)
Using ``project.py`` (code is more verbose than required to allow explanation of method arguments)
```python
import os
@@ -77,3 +77,49 @@ if __name__ == "__main__":
##### Expected output:
``members, 226656153, 1``<br><br>
``where is the love he had dated for much of the past who couldn't read in six grade and inspired him, 300000, 10``

### Sample usage (`word_segmentation`)
Using ``project.py`` (code is more verbose than required to allow explanation of method arguments)
```python
import os

from symspellpy.symspellpy import SymSpell  # import the module

def main():
    # create object
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"

    result = sym_spell.word_segmentation(input_term)
    # display the corrected string, edit distance sum, and
    # log probability sum of the segmentation
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))

if __name__ == "__main__":
    main()
```
##### Expected output:
``the quick brown fox jumps over the lazy dog, 8, -34.491167981910635``
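
`word_segmentation` returns its result as a named tuple, so the values above can be read by field name or by index. A minimal sketch continuing the example above (field names taken from the `Composition` named tuple added in this commit; indexed access mirrors the new unit tests):
```python
result = sym_spell.word_segmentation(input_term)
# access results by attribute...
print(result.segmented_string)  # words split, spelling left as-is
print(result.corrected_string)  # words split and spell-corrected
print(result.distance_sum)      # total edit distance used
print(result.log_prob_sum)      # summed log10 word probabilities
# ...or by index, as the new unit tests do
corrected = result[1]  # same as result.corrected_string
```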
2 changes: 1 addition & 1 deletion symspellpy/__version__.py
@@ -1,7 +1,7 @@
__title__ = "symspellpy"
__description__ = 'Python SymSpell'
__url__ = "https://github.com/mammothb/symspellpy"
__version__ = "6.3.2"
__version__ = "6.3.3"
__author__ = "mmb L"
__author_email__ = "[email protected]"
__license__ = "MIT"
2 changes: 1 addition & 1 deletion symspellpy/editdistance.py
@@ -132,7 +132,7 @@ def _distance_max(self, string_1, string_2, len_1, len_2, start, max_distance,
from: https://github.com/softwx/SoftWx.Match
"""
char_1_costs = np.asarray([j + 1 if j < max_distance
else max_distance + 1 for j in range(len_2)])
else max_distance + 1 for j in range(len_2)])
len_diff = len_2 - len_1
j_start_offset = max_distance - len_diff
j_start = 0
158 changes: 142 additions & 16 deletions symspellpy/symspellpy.py
@@ -1,6 +1,8 @@
from collections import defaultdict
from collections import defaultdict, namedtuple
from enum import Enum
from os import path
from itertools import cycle
import math
import os.path
import sys

from symspellpy.editdistance import DistanceAlgorithm, EditDistance
@@ -126,9 +128,9 @@ def create_dictionary_entry(self, key, count):
self._max_length = len(key)

# create deletes
edits = self.edits_prefix(key)
edits = self._edits_prefix(key)
for delete in edits:
delete_hash = self.get_str_hash(delete)
delete_hash = self._get_str_hash(delete)
self._deletes[delete_hash].append(key)
return True

Expand All @@ -145,7 +147,7 @@ def load_dictionary(self, corpus, term_index, count_index, encoding=None):
Return:
True if file loaded, or False if file not found.
"""
if not path.exists(corpus):
if not os.path.exists(corpus):
return False
with open(corpus, "r", encoding=encoding) as infile:
for line in infile:
@@ -238,8 +240,8 @@ def early_exit():
continue
break

if self.get_str_hash(candidate) in self._deletes:
dict_suggestions = self._deletes[self.get_str_hash(candidate)]
if self._get_str_hash(candidate) in self._deletes:
dict_suggestions = self._deletes[self._get_str_hash(candidate)]
for suggestion in dict_suggestions:
if suggestion == phrase:
continue
@@ -320,7 +322,7 @@ def early_exit():
# delete_in_suggestion_prefix is somewhat expensive,
# and only pays off when verbosity is TOP or CLOSEST
if ((verbosity != Verbosity.ALL
and not self.delete_in_suggestion_prefix(
and not self._delete_in_suggestion_prefix(
candidate, candidate_len, suggestion,
suggestion_len))
or suggestion in considered_suggestions):
@@ -428,7 +430,7 @@ def lookup_compound(self, phrase, max_edit_distance,
else:
best_2 = SuggestItem(term_list_1[i],
max_edit_distance + 1, 0)
# make sure we're comparing with the lowercase form of the
# make sure we're comparing with the lowercase form of the
# previous word
distance_1 = distance_comparer.compare(
term_list_1[i - 1] + " " + term_list_1[i],
@@ -513,8 +515,128 @@ def lookup_compound(self, phrase, max_edit_distance,
suggestions_line.append(suggestion)
return suggestions_line

def delete_in_suggestion_prefix(self, delete, delete_len, suggestion,
suggestion_len):
def word_segmentation(self, phrase, max_edit_distance=None,
max_segmentation_word_length=None):
"""word_egmentation divides a string into words by inserting missing
spaces at the appropriate positions misspelled words are corrected
and do not affect segmentation existing spaces are allowed and
considered for optimum segmentation
word_segmentation uses a novel approach *without* recursion.
https://medium.com/@wolfgarbe/fast-word-segmentation-for-noisy-text-2c2c41f9e8da
While each string of length n can be segmented in 2^n−1 possible
compositions https://en.wikipedia.org/wiki/Composition_(combinatorics)
word_segmentation has a linear runtime O(n) to find the optimum
composition
Find suggested spellings for a multi-word input string (supports word
splitting/merging).
Keyword arguments:
phrase -- The string being spell checked.
max_segmentation_word_length -- The maximum word length that should
be considered.
max_edit_distance -- The maximum edit distance between input and
corrected words (0=no correction/segmentation only).
Return:
The word segmented string, the word segmented and spelling corrected
string, the Edit distance sum between input string and corrected
string, the Sum of word occurence probabilities in log scale (a
measure of how common and probable the corrected segmentation is).
"""
# number of all words in the corpus used to generate the frequency
# dictionary. This is used to calculate the word occurrence
# probability p from word counts c: p=c/N. N equals the sum of all
# counts c in the dictionary only if the dictionary is complete, but
# not if the dictionary is truncated or filtered
N = 1024908267229
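# illustrative check (assuming the bundled English dictionary's
# top entry "the" with count 23135851162):
# p = 23135851162 / N ~= 0.0226, log10(p) ~= -1.65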
if max_edit_distance is None:
max_edit_distance = self._max_dictionary_edit_distance
if max_segmentation_word_length is None:
max_segmentation_word_length = self._max_length
array_size = min(max_segmentation_word_length, len(phrase))
compositions = [Composition()] * array_size
circular_index = cycle(range(array_size))
idx = -1
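# compositions acts as a circular buffer of size array_size:
# dest = (i + idx) % array_size below maps the part ending at
# position j + i onto the ring, so only the best compositions
# for the last max_segmentation_word_length end positions are kept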

# outer loop (column): all possible part start positions
for j in range(len(phrase)):
# inner loop (row): all possible part lengths (from start
# position): part can't be bigger than longest word in dictionary
# (other than long unknown word)
imax = min(len(phrase) - j, max_segmentation_word_length)
for i in range(1, imax + 1):
# get top spelling correction/ed for part
part = phrase[j : j + i]
separator_len = 0
top_ed = 0
top_log_prob = 0.0
top_result = ""

if part[0].isspace():
# remove space for Levenshtein calculation
part = part[1 :]
else:
# add ed+1: space did not exist, had to be inserted
separator_len = 1

# remove spaces from part and add the number of removed
# spaces to top_ed
top_ed += len(part)
part = part.replace(" ", "")
top_ed -= len(part)

results = self.lookup(part, Verbosity.TOP, max_edit_distance)
if results:
top_result = results[0].term
top_ed += results[0].distance
# Naive Bayes Rule. We assume the word probabilities of
# two words to be independent. Therefore the resulting
# probability of the word combination is the product of
# the two word probabilities.
# Instead of computing the product of probabilities we
# compute the sum of the logarithms of probabilities,
# because the probabilities of words are about 10^-10 and
# the product of many such small numbers could underflow
# the floating point range and become zero:
# log(ab) = log(a) + log(b)
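# illustrative: a 40-word segmentation with per-word p ~ 10^-10
# would have product 10^-400, below the float64 minimum
# (~10^-308), while the log sum -400.0 is exactly representable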
top_log_prob = math.log10(float(results[0].count) /
float(N))
else:
top_result = part
# default, if word not found: otherwise a long input text
# would win as a single long unknown word (with ed=edmax+1),
# although many spaces should have been inserted instead
top_ed += len(part)
top_log_prob = math.log10(10.0 / N /
math.pow(10.0, len(part)))
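# note: log10(10.0 / N / 10^len(part)) = log10(10/N) - len(part),
# i.e. each extra character of an unknown word divides its
# assumed probability by 10, a penalty linear in word length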

dest = (i + idx) % array_size
# set values in first loop
if j == 0:
compositions[dest] = Composition(part, top_result, top_ed,
top_log_prob)
# pylint: disable=C0301,R0916
elif (i == max_segmentation_word_length
# replace values if better log_prob_sum, if same
# edit distance OR one space difference
or ((compositions[idx].distance_sum + top_ed == compositions[dest].distance_sum
or compositions[idx].distance_sum + separator_len + top_ed == compositions[dest].distance_sum)
and compositions[dest].log_prob_sum < compositions[idx].log_prob_sum + top_log_prob)
# replace values if smaller edit distance
or compositions[idx].distance_sum + separator_len + top_ed < compositions[dest].distance_sum):
compositions[dest] = Composition(
compositions[idx].segmented_string + " " + part,
compositions[idx].corrected_string + " " + top_result,
compositions[idx].distance_sum + separator_len + top_ed,
compositions[idx].log_prob_sum + top_log_prob)
idx = next(circular_index)
return compositions[idx]

def _delete_in_suggestion_prefix(self, delete, delete_len, suggestion,
suggestion_len):
"""check whether all delete chars are present in the suggestion
prefix in correct order, otherwise this is just a hash collision
"""
@@ -531,7 +653,7 @@ def delete_in_suggestion_prefix(self, delete, delete_len, suggestion,
return False
return True

def edits(self, word, edit_distance, delete_words):
def _edits(self, word, edit_distance, delete_words):
"""inexpensive and language independent: only deletes,
no transposes + replaces + inserts replaces and inserts are expensive
and language dependent
@@ -544,19 +666,19 @@ def edits(self, word, edit_distance, delete_words):
delete_words.add(delete)
# recursion, if maximum edit distance not yet reached
if edit_distance < self._max_dictionary_edit_distance:
self.edits(delete, edit_distance, delete_words)
self._edits(delete, edit_distance, delete_words)
return delete_words

def edits_prefix(self, key):
def _edits_prefix(self, key):
hash_set = set()
if len(key) <= self._max_dictionary_edit_distance:
hash_set.add("")
if len(key) > self._max_dictionary_edit_distance:
key = key[: self._prefix_length]
hash_set.add(key)
return self.edits(key, 0, hash_set)
return self._edits(key, 0, hash_set)

def get_str_hash(self, s):
def _get_str_hash(self, s):
s_len = len(s)
mask_len = min(s_len, 3)

@@ -635,3 +757,7 @@ def count(self):
@count.setter
def count(self, count):
self._count = count

Composition = namedtuple("Composition", ["segmented_string", "corrected_string",
"distance_sum", "log_prob_sum"])
Composition.__new__.__defaults__ = (None,) * len(Composition._fields)
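# the defaults line above makes every field optional, e.g. (sketch):
# Composition()      -> all four fields None
# Composition("a b") -> segmented_string="a b", remaining fields None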
30 changes: 30 additions & 0 deletions test/test_symspellpy.py
@@ -22,6 +22,7 @@ def runTest(self):
self.test_lookup_compound()
self.test_lookup_compound_ignore_non_words()
self.test_load_dictionary_encoding()
self.test_word_segmentation()

def test_words_with_shared_prefix_should_retain_counts(self):
print(' - %s' % inspect.stack()[0][3])
@@ -284,6 +285,35 @@ def test_load_dictionary_encoding(self):
self.assertEqual(1, len(result))
self.assertEqual("АБИ", result[0].term)

def test_word_segmentation(self):
print(' - %s' % inspect.stack()[0][3])
cwd = path.realpath(path.dirname(__file__))
dictionary_path = path.realpath(path.join(
cwd, pardir, "symspellpy", "frequency_dictionary_en_82_765.txt"))

edit_distance_max = 0
prefix_length = 7
sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
sym_spell.load_dictionary(dictionary_path, 0, 1)

typo = "thequickbrownfoxjumpsoverthelazydog"
correction = "the quick brown fox jumps over the lazy dog"
result = sym_spell.word_segmentation(typo)
self.assertEqual(correction, result.corrected_string)

typo = "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen"
correction = ("it was a bright cold day in april and the clocks "
"were striking thirteen")
result = sym_spell.word_segmentation(typo)
self.assertEqual(correction, result[1])

typo = ("itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
"itwastheageoffoolishness")
correction = ("it was the best of times it was the worst of times "
"it was the age of wisdom it was the age of foolishness")
result = sym_spell.word_segmentation(typo)
self.assertEqual(correction, result[1])

if __name__ == "__main__":
runner = unittest.TextTestRunner()
suite = unittest.TestSuite()
