Merge pull request #15 from mammothb/dev
v6.3.3 implemented word_segmentation
mammothb authored Dec 5, 2018
2 parents f157013 + 20bbb13 commit baeeb47
Showing 6 changed files with 225 additions and 19 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,10 @@
CHANGELOG <br>
==============

## 6.3.3 (2018-12-05)
---------------------
- Added `word_segmentation()` feature

## 6.3.2 (2018-10-23)
---------------------
- Added `encoding` option to `load_dictionary()`
48 changes: 47 additions & 1 deletion README.md
@@ -21,7 +21,7 @@ project_dir
\-project.py
```

### Sample usage
### Sample usage (`lookup` and `lookup_compound`)
Using ``project.py`` (code is more verbose than required to allow explanation of method arguments)
```python
import os
@@ -77,3 +77,49 @@ if __name__ == "__main__":
##### Expected output:
``members, 226656153, 1``<br><br>
``where is the love he had dated for much of the past who couldn't read in six grade and inspired him, 300000, 10``

### Sample usage (`word_segmentation`)
Using ``project.py`` (code is more verbose than required to allow explanation of method arguments)
```python
import os

from symspellpy.symspellpy import SymSpell  # import the module

def main():
    # create object
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"

    result = sym_spell.word_segmentation(input_term)
    # display the corrected string, edit distance sum, and
    # log probability sum of the segmentation
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))

if __name__ == "__main__":
    main()
```
##### Expected output:
``the quick brown fox jumps over the lazy dog, 8, -34.491167981910635``
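
`word_segmentation` returns its result as a named tuple, so the values above can be read by field name or by index. A minimal sketch continuing the example above (field names taken from the `Composition` named tuple added in this commit; indexed access mirrors the new unit tests):
```python
result = sym_spell.word_segmentation(input_term)
# access results by attribute...
print(result.segmented_string)  # words split, spelling left as-is
print(result.corrected_string)  # words split and spell-corrected
print(result.distance_sum)      # total edit distance used
print(result.log_prob_sum)      # summed log10 word probabilities
# ...or by index, as the new unit tests do
corrected = result[1]  # same as result.corrected_string
```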
2 changes: 1 addition & 1 deletion symspellpy/__version__.py
@@ -1,7 +1,7 @@
__title__ = "symspellpy"
__description__ = 'Python SymSpell'
__url__ = "https://github.com/mammothb/symspellpy"
__version__ = "6.3.2"
__version__ = "6.3.3"
__author__ = "mmb L"
__author_email__ = "[email protected]"
__license__ = "MIT"
2 changes: 1 addition & 1 deletion symspellpy/editdistance.py
@@ -132,7 +132,7 @@ def _distance_max(self, string_1, string_2, len_1, len_2, start, max_distance,
from: https://github.com/softwx/SoftWx.Match
"""
char_1_costs = np.asarray([j + 1 if j < max_distance
else max_distance + 1 for j in range(len_2)])
else max_distance + 1 for j in range(len_2)])
len_diff = len_2 - len_1
j_start_offset = max_distance - len_diff
j_start = 0
158 changes: 142 additions & 16 deletions symspellpy/symspellpy.py
@@ -1,6 +1,8 @@
from collections import defaultdict
from collections import defaultdict, namedtuple
from enum import Enum
from os import path
from itertools import cycle
import math
import os.path
import sys

from symspellpy.editdistance import DistanceAlgorithm, EditDistance
@@ -126,9 +128,9 @@ def create_dictionary_entry(self, key, count):
self._max_length = len(key)

# create deletes
edits = self.edits_prefix(key)
edits = self._edits_prefix(key)
for delete in edits:
delete_hash = self.get_str_hash(delete)
delete_hash = self._get_str_hash(delete)
self._deletes[delete_hash].append(key)
return True

Expand All @@ -145,7 +147,7 @@ def load_dictionary(self, corpus, term_index, count_index, encoding=None):
Return:
True if file loaded, or False if file not found.
"""
if not path.exists(corpus):
if not os.path.exists(corpus):
return False
with open(corpus, "r", encoding=encoding) as infile:
for line in infile:
@@ -238,8 +240,8 @@ def early_exit():
continue
break

if self.get_str_hash(candidate) in self._deletes:
dict_suggestions = self._deletes[self.get_str_hash(candidate)]
if self._get_str_hash(candidate) in self._deletes:
dict_suggestions = self._deletes[self._get_str_hash(candidate)]
for suggestion in dict_suggestions:
if suggestion == phrase:
continue
@@ -320,7 +322,7 @@ def early_exit():
# delete_in_suggestion_prefix is somewhat expensive,
# and only pays off when verbosity is TOP or CLOSEST
if ((verbosity != Verbosity.ALL
and not self.delete_in_suggestion_prefix(
and not self._delete_in_suggestion_prefix(
candidate, candidate_len, suggestion,
suggestion_len))
or suggestion in considered_suggestions):
@@ -428,7 +430,7 @@ def lookup_compound(self, phrase, max_edit_distance,
else:
best_2 = SuggestItem(term_list_1[i],
max_edit_distance + 1, 0)
# make sure we're comparing with the lowercase form of the
# make sure we're comparing with the lowercase form of the
# previous word
distance_1 = distance_comparer.compare(
term_list_1[i - 1] + " " + term_list_1[i],
@@ -513,8 +515,128 @@ def lookup_compound(self, phrase, max_edit_distance,
suggestions_line.append(suggestion)
return suggestions_line

def delete_in_suggestion_prefix(self, delete, delete_len, suggestion,
suggestion_len):
def word_segmentation(self, phrase, max_edit_distance=None,
max_segmentation_word_length=None):
"""word_egmentation divides a string into words by inserting missing
spaces at the appropriate positions misspelled words are corrected
and do not affect segmentation existing spaces are allowed and
considered for optimum segmentation
word_segmentation uses a novel approach *without* recursion.
https://medium.com/@wolfgarbe/fast-word-segmentation-for-noisy-text-2c2c41f9e8da
While each string of length n can be segmented in 2^n−1 possible
compositions https://en.wikipedia.org/wiki/Composition_(combinatorics)
word_segmentation has a linear runtime O(n) to find the optimum
composition
Find suggested spellings for a multi-word input string (supports word
splitting/merging).
Keyword arguments:
phrase -- The string being spell checked.
max_segmentation_word_length -- The maximum word length that should
be considered.
max_edit_distance -- The maximum edit distance between input and
corrected words (0=no correction/segmentation only).
Return:
The word segmented string, the word segmented and spelling corrected
string, the Edit distance sum between input string and corrected
string, the Sum of word occurence probabilities in log scale (a
measure of how common and probable the corrected segmentation is).
"""
# number of all words in the corpus used to generate the frequency
# dictionary. This is used to calculate the word occurrence
# probability p from word counts c: p=c/N. N equals the sum of all
# counts c in the dictionary only if the dictionary is complete, but
# not if the dictionary is truncated or filtered
N = 1024908267229
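# illustrative check (assuming the bundled English dictionary's
# top entry "the" with count 23135851162):
# p = 23135851162 / N ~= 0.0226, log10(p) ~= -1.65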
if max_edit_distance is None:
max_edit_distance = self._max_dictionary_edit_distance
if max_segmentation_word_length is None:
max_segmentation_word_length = self._max_length
array_size = min(max_segmentation_word_length, len(phrase))
compositions = [Composition()] * array_size
circular_index = cycle(range(array_size))
idx = -1
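# compositions acts as a circular buffer of size array_size:
# dest = (i + idx) % array_size below maps the part ending at
# position j + i onto the ring, so only the best compositions
# for the last max_segmentation_word_length end positions are kept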

# outer loop (column): all possible part start positions
for j in range(len(phrase)):
# inner loop (row): all possible part lengths (from start
# position): part can't be bigger than longest word in dictionary
# (other than long unknown word)
imax = min(len(phrase) - j, max_segmentation_word_length)
for i in range(1, imax + 1):
# get top spelling correction/ed for part
part = phrase[j : j + i]
separator_len = 0
top_ed = 0
top_log_prob = 0.0
top_result = ""

if part[0].isspace():
# remove space for Levenshtein calculation
part = part[1 :]
else:
# add ed+1: space did not exist, had to be inserted
separator_len = 1

# remove spaces from part and add the number of removed
# spaces to top_ed
top_ed += len(part)
part = part.replace(" ", "")
top_ed -= len(part)

results = self.lookup(part, Verbosity.TOP, max_edit_distance)
if results:
top_result = results[0].term
top_ed += results[0].distance
# Naive Bayes Rule. We assume the word probabilities of
# two words to be independent. Therefore the resulting
# probability of the word combination is the product of
# the two word probabilities.
# Instead of computing the product of probabilities we
# compute the sum of the logarithms of probabilities,
# because the probabilities of words are about 10^-10 and
# the product of many such small numbers could underflow
# the floating point range and become zero:
# log(ab) = log(a) + log(b)
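# illustrative: a 40-word segmentation with per-word p ~ 10^-10
# would have product 10^-400, below the float64 minimum
# (~10^-308), while the log sum -400.0 is exactly representable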
top_log_prob = math.log10(float(results[0].count) /
float(N))
else:
top_result = part
# default, if word not found: otherwise a long input text
# would win as a single long unknown word (with ed=edmax+1),
# although many spaces should have been inserted instead
top_ed += len(part)
top_log_prob = math.log10(10.0 / N /
math.pow(10.0, len(part)))
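# note: log10(10.0 / N / 10^len(part)) = log10(10/N) - len(part),
# i.e. each extra character of an unknown word divides its
# assumed probability by 10, a penalty linear in word length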

dest = (i + idx) % array_size
# set values in first loop
if j == 0:
compositions[dest] = Composition(part, top_result, top_ed,
top_log_prob)
# pylint: disable=C0301,R0916
elif (i == max_segmentation_word_length
# replace values if better log_prob_sum, if same
# edit distance OR one space difference
or ((compositions[idx].distance_sum + top_ed == compositions[dest].distance_sum
or compositions[idx].distance_sum + separator_len + top_ed == compositions[dest].distance_sum)
and compositions[dest].log_prob_sum < compositions[idx].log_prob_sum + top_log_prob)
# replace values if smaller edit distance
or compositions[idx].distance_sum + separator_len + top_ed < compositions[dest].distance_sum):
compositions[dest] = Composition(
compositions[idx].segmented_string + " " + part,
compositions[idx].corrected_string + " " + top_result,
compositions[idx].distance_sum + separator_len + top_ed,
compositions[idx].log_prob_sum + top_log_prob)
idx = next(circular_index)
return compositions[idx]

def _delete_in_suggestion_prefix(self, delete, delete_len, suggestion,
suggestion_len):
"""check whether all delete chars are present in the suggestion
prefix in correct order, otherwise this is just a hash collision
"""
@@ -531,7 +653,7 @@ def delete_in_suggestion_prefix(self, delete, delete_len, suggestion,
return False
return True

def edits(self, word, edit_distance, delete_words):
def _edits(self, word, edit_distance, delete_words):
"""inexpensive and language independent: only deletes,
no transposes + replaces + inserts replaces and inserts are expensive
and language dependent
@@ -544,19 +666,19 @@ def edits(self, word, edit_distance, delete_words):
delete_words.add(delete)
# recursion, if maximum edit distance not yet reached
if edit_distance < self._max_dictionary_edit_distance:
self.edits(delete, edit_distance, delete_words)
self._edits(delete, edit_distance, delete_words)
return delete_words

def edits_prefix(self, key):
def _edits_prefix(self, key):
hash_set = set()
if len(key) <= self._max_dictionary_edit_distance:
hash_set.add("")
if len(key) > self._max_dictionary_edit_distance:
key = key[: self._prefix_length]
hash_set.add(key)
return self.edits(key, 0, hash_set)
return self._edits(key, 0, hash_set)

def get_str_hash(self, s):
def _get_str_hash(self, s):
s_len = len(s)
mask_len = min(s_len, 3)

@@ -635,3 +757,7 @@ def count(self):
@count.setter
def count(self, count):
self._count = count

Composition = namedtuple("Composition", ["segmented_string", "corrected_string",
"distance_sum", "log_prob_sum"])
Composition.__new__.__defaults__ = (None,) * len(Composition._fields)
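# the defaults line above makes every field optional, e.g. (sketch):
# Composition()      -> all four fields None
# Composition("a b") -> segmented_string="a b", remaining fields None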
30 changes: 30 additions & 0 deletions test/test_symspellpy.py
@@ -22,6 +22,7 @@ def runTest(self):
self.test_lookup_compound()
self.test_lookup_compound_ignore_non_words()
self.test_load_dictionary_encoding()
self.test_word_segmentation()

def test_words_with_shared_prefix_should_retain_counts(self):
print(' - %s' % inspect.stack()[0][3])
@@ -284,6 +285,35 @@ def test_load_dictionary_encoding(self):
self.assertEqual(1, len(result))
self.assertEqual("АБИ", result[0].term)

def test_word_segmentation(self):
print(' - %s' % inspect.stack()[0][3])
cwd = path.realpath(path.dirname(__file__))
dictionary_path = path.realpath(path.join(
cwd, pardir, "symspellpy", "frequency_dictionary_en_82_765.txt"))

edit_distance_max = 0
prefix_length = 7
sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
sym_spell.load_dictionary(dictionary_path, 0, 1)

typo = "thequickbrownfoxjumpsoverthelazydog"
correction = "the quick brown fox jumps over the lazy dog"
result = sym_spell.word_segmentation(typo)
self.assertEqual(correction, result.corrected_string)

typo = "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen"
correction = ("it was a bright cold day in april and the clocks "
"were striking thirteen")
result = sym_spell.word_segmentation(typo)
self.assertEqual(correction, result[1])

typo = ("itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
"itwastheageoffoolishness")
correction = ("it was the best of times it was the worst of times "
"it was the age of wisdom it was the age of foolishness")
result = sym_spell.word_segmentation(typo)
self.assertEqual(correction, result[1])

if __name__ == "__main__":
runner = unittest.TextTestRunner()
suite = unittest.TestSuite()
