Skip to content

Commit

Permalink
Merge branch 'epic/dict-breaker' into chore/merge-master-into-dict-breaker
Browse files Browse the repository at this point in the history
  • Loading branch information
mcdurdin authored Dec 5, 2024
2 parents 1fd2a1b + e22be7f commit 0d4be5f
Show file tree
Hide file tree
Showing 8 changed files with 706 additions and 2 deletions.
2 changes: 2 additions & 0 deletions web/src/engine/predictive-text/wordbreakers/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ const breakWords = wordBreakers['default'];
console.log(breakWords('Hello, World!').map(span => span.text));
// prints: [ 'Hello', ',', 'World', '!' ]
```

## TODO: dict-breakers
1 change: 1 addition & 0 deletions web/src/engine/predictive-text/wordbreakers/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ THIS_SCRIPT="$(readlink -f "${BASH_SOURCE[0]}")"
# Note: the raw text files used for data.inc.ts are found within
# /resources/standards-data/unicode-character-database.
builder_describe "Builds the predictive-text wordbreaker implementation module" \
"@../templates test" \
"clean" \
"configure" \
"build" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ export default function default_(text: string, options?: DefaultWordBreakerOptio
/**
* A span that does not cut out the substring until it absolutely has to!
*/
class LazySpan implements LexicalModelTypes.Span {
export class LazySpan implements LexicalModelTypes.Span {
private _source: string;
readonly start: number;
readonly end: number;
Expand Down
329 changes: 329 additions & 0 deletions web/src/engine/predictive-text/wordbreakers/src/main/dict/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,329 @@
import { LazySpan } from "../default/index.js";
import { Span, LexiconTraversal } from "@keymanapp/common-types";

// Cost (in -log2 probability space) charged when a codepoint cannot be matched by any
// dictionary word and must be emitted as its own single-character "unmatched" token.
// Based on the MIN_KEYSTROKE_PROBABILITY penalty used by the lm-worker.
const CHAR_SKIP_PENALTY = -Math.log2(.0001);

// const DEFAULT_PARAMS = {

// }

/**
 * Splits text into spans of non-whitespace, non-ZWNJ content.  Whitespace and ZWNJ
 * (U+200C) act as hard delimiters and are not emitted as tokens themselves.
 * @param text The text to split.
 * @returns One `Span` per delimiter-free run, indexed against `text`.
 */
export function splitOnWhitespace(text: string): Span[] {
  const sections: Span[] = [];

  // Hoisted: any whitespace char or ZWNJ ends the current section.
  const delimiter = /\s|\u200c/;

  // `undefined` means "not currently inside a section".  (Initializing to 0 would
  // emit a spurious empty span when `text` starts with a delimiter.)
  let start: number | undefined = undefined;

  // Appending a trailing space guarantees the final section is flushed by the loop
  // itself, so no post-loop cleanup is needed.
  // Surrogate pairs will never overlap \u0020, so we don't need to be
  // surrogate-pair aware here.
  text += ' ';
  for(let index = 0; index < text.length; index++) {
    const char = text.charAt(index);
    if(delimiter.test(char)) {
      if(start !== undefined) {
        sections.push(new LazySpan(text, start, index));
        start = undefined; // we do not emit whitespace tokens here.
      }
    } else if(start === undefined) {
      start = index;
    }
  }

  return sections;
}

/**
 * Represents one candidate segmentation path (a "beam entry") during the
 * dictionary-based word-breaking search over a whitespace-free section of text.
 */
export type DictBreakerPath = {
  /**
   * The index of the character immediately before the most recently-available word boundary.
   * Is set to -1 if no such boundary exists.
   */
  boundaryIndex: number;

  // Could add a 'reference' if we create objects for each char in the context - such as for
  // caching & reusing boundary info with future inputs.

  /**
   * An active traversal representing potential words that may become completed, starting
   * immediately after the boundary indicated by `boundaryIndex`.
   */
  traversal: LexiconTraversal;

  /**
   * cost: measured in -log2(p) of each decision; lower (closer to 0) is better.
   */
  cost: number;

  /**
   * Indicates if this path's most recent traversal enforces a boundary without matching
   * a word in the lexicon.
   */
  wasUnmatchedChar?: boolean;

  /**
   * The path object used to reach the previous boundary.  Forms a linked list which
   * is rewound at search end to reconstruct the winning segmentation.
   */
  parent?: DictBreakerPath;
}

/**
* Provides dictionary-based wordbreaking assuming a LexiconTraversal can be specified for
* the dictionary.
* @param fullText The full context to be tokenized.
* @param dictRoot A LexiconTraversal interface from the active LexicalModel,
* allowing efficient dictionary lookups of encountered words.
* @returns
*/
/**
 * Provides dictionary-based wordbreaking assuming a LexiconTraversal can be specified for
 * the dictionary.
 * @param fullText The full context to be tokenized.
 * @param dictRoot A LexiconTraversal interface from the active LexicalModel,
 * allowing efficient dictionary lookups of encountered words.
 * @returns Spans for every detected word, indexed against the original context.
 */
export default function dict(fullText: string, dictRoot: LexiconTraversal): Span[] {
  if(!dictRoot) {
    throw new Error("Cannot use dictionary-based wordbreaker without `LexiconTraversal` dictionary access");
  }

  // Spaces and ZWNJs (U+200C) mark 100%-confirmed wordbreaks; only the text between
  // them needs dictionary-driven analysis.
  //
  // Note: the final word of a section may be a 'partial' match mid-typing; that
  // slight imprecision is accepted rather than specially handled.
  const whitespaceFreeSections = splitOnWhitespace(fullText);
  return whitespaceFreeSections.flatMap((section) => _dict_break(section, dictRoot));
}

// Exposed for testing reasons.
/**
* Given a section of text without whitespaces and ZWNJs, uses the active lexical-model's
* entries to detect optimal word-breaking locations.
* @param span A span representing the section and its position within the context.
* @param dictRoot A LexiconTraversal interface from the active LexicalModel,
* allowing efficient dictionary lookups of encountered words.
* @returns An array of `Span`s representing each tokenized word, indexed according to their
* location in the section's original context.
*/
// Exposed for testing reasons.
/**
 * Given a section of text without whitespaces and ZWNJs, uses the active lexical-model's
 * entries to detect optimal word-breaking locations.
 *
 * Implementation sketch: a Viterbi-style lattice search.  Each `DictBreakerPath` is one
 * "word in progress"; at every codepoint all live paths are advanced through the lexicon
 * trie, the cheapest boundary-compatible path spawns a fresh root-level successor, and
 * unmatched codepoints fall back to a fixed skip penalty.
 * @param span A span representing the section and its position within the context.
 * @param dictRoot A LexiconTraversal interface from the active LexicalModel,
 * allowing efficient dictionary lookups of encountered words.
 * @returns An array of `Span`s representing each tokenized word, indexed according to their
 * location in the section's original context.
 */
export function _dict_break(span: Span, dictRoot: LexiconTraversal): Span[] {
  if(span.length == 0) {
    return [];
  }

  const text = span.text;
  // Code-unit offset of this section within the full context; spans are re-based
  // against it in pass 2 below.
  const splitIndex = span.start;

  // 1. Splay the string into individual codepoints.
  const codepointArr = [...text];

  // 2. Initialize tracking vars and prep the loop.
  // Sentinel path: "boundary before the first char", rooted at the lexicon root.
  let bestBoundingPath: DictBreakerPath = {
    boundaryIndex: -1,
    traversal: dictRoot,
    cost: 0
  };

  // Optimization TODO: convert to priority queue?
  let activePaths: DictBreakerPath[] = [bestBoundingPath];

  // 3. Run the master loop.
  // 3a. For each codepoint in the string...
  for(let i=0; i < codepointArr.length; i++) {
    const codepoint = codepointArr[i];
    let paths: DictBreakerPath[] = [];

    // 3b. compute all viable paths to continue words & start new ones.
    for(const path of activePaths) {
      let traversal = path.traversal.child(codepoint);
      if(!traversal) {
        // No lexicon word continues this prefix; the path dies here.
        continue;
      }

      const pathCtd: DictBreakerPath = {
        boundaryIndex: path.boundaryIndex,
        traversal: traversal,
        // NOTE(review): cost is rebuilt from the parent (previous-boundary) cost each
        // step rather than accumulating from `path.cost` - this assumes `traversal.p`
        // is the cumulative probability of the entire prefix since that boundary.
        // Confirm against the LexiconTraversal contract.
        cost: (path.parent?.cost ?? 0) - Math.log2(traversal.p),
        parent: path.parent
      }

      paths.push(pathCtd);
    }

    // 3c. Find the minimal-cost new path with a word boundary, if any exist.
    // If the traversal has entries, it's a legal path-end; else it isn't.
    const boundingPaths = paths.filter((path) => !!path.traversal.entries.length);
    // If none exist, this is the fallback: emit this codepoint as an unmatched
    // single-char token at a fixed penalty cost.
    const penaltyParent: DictBreakerPath = {
      boundaryIndex: i-1, // successor will cover one codepoint
      // NOTE(review): `child()` may return undefined for this codepoint; the value is
      // never traversed afterward, but it contradicts the non-optional type - confirm.
      traversal: dictRoot.child(codepoint), // no `entries`, but... it's fine.
      // bestBoundingPath is currently a root-level traversal. Its parent corresponds
      // to the previous token.
      cost: bestBoundingPath.cost + CHAR_SKIP_PENALTY,
      parent: bestBoundingPath.parent,
      wasUnmatchedChar: true
    };

    boundingPaths.push(penaltyParent);
    // Sort in cost-ascending order.
    // As we're using negative log likelihood, smaller is better.
    // (The closer to log_2(1) = 0, the better.)
    boundingPaths.sort((a, b) => a.cost - b.cost);

    // We build a new path starting from this specific path; we're modeling a word-end.
    // If it's the "penalty path", we already built it.
    const bestBound = boundingPaths[0];
    const successorPath: DictBreakerPath = {
      boundaryIndex: i,
      traversal: dictRoot,
      cost: bestBound.cost,
      parent: bestBound
    }

    bestBoundingPath = successorPath;
    paths.push(successorPath);

    // 3d. We now shift to the next loop iteration; we use the descendant `paths` set.
    activePaths = paths;
  }

  // 4. When all iterations are done, determine the lowest-cost path that
  // remains, without regard to if it supports a word-boundary.
  //
  // If we happen to end on a potential word-boundary, opt for that one. If two
  // match aside from boundaryIndex, take the lesser. It comes first, BTW, so
  // stable-sorts auto-resolve this.
  activePaths.sort((a, b) => (a.cost - b.cost));
  const winningPath = activePaths[0];

  // 5. Build the spans by rewinding the winning path's parent chain, last token first.
  const spans: (Span & { codepointLength: number, unmatched: boolean })[] = [];
  const pathAsArray: DictBreakerPath[] = [];

  let rewindPath = winningPath;
  while(rewindPath) {
    const start = rewindPath.boundaryIndex+1;
    const end = codepointArr.length; // consistent because of the effects from the splice below
    // splice() both extracts this token's codepoints AND shrinks the array, keeping
    // `end` correct for the next (earlier) iteration.
    const text = codepointArr.splice(start, end - start).join('');

    pathAsArray.unshift(rewindPath);
    spans.unshift({
      start: start, // currently in code points; we'll correct it on the next pass.
      end: end, // same.
      length: text.length, // Span spec: in code units
      text: text,
      codepointLength: end - start,
      unmatched: !!rewindPath.wasUnmatchedChar
    });
    rewindPath = rewindPath.parent;
  }

  // 6. Span pass 2 - index finalization.
  // - Remember, split-index is our offset!
  // - We currently have codepoint `start` and `end`, but need code-unit values.
  let totalLength = splitIndex;
  for(let i = 0; i < spans.length; i++) {
    const baseSpan = spans[i];
    const start = totalLength;
    totalLength += baseSpan.length;

    const trueSpan: typeof spans[0] = {
      ...baseSpan,
      start: start,
      end: totalLength
    };

    spans[i] = trueSpan;
  }

  // If all we had was whitespace, hence no spans, return.
  if(spans.length == 0) {
    return spans;
  }

  // 7. Span pass 3: identify continuous penalty spans. Why split into separate spans
  // when we can merge all the bits we can't recognize as a big lump instead?
  // - Looks nicer in the unit tests, if nothing else.
  // - Has _far_ better potential for 'learning' down the line.

  let spanBucket: Span[] = [];
  const finalSpans: Span[] = [];

  // Flushes the accumulated run of unmatched single-char spans as one merged span.
  function mergeBucket(spanBucket: Span[]) {
    if(spanBucket.length > 0) {
      const startSpan = spanBucket[0];
      const endSpan = spanBucket[spanBucket.length - 1];
      //

      finalSpans.push({
        start: startSpan.start,
        end: endSpan.end,
        length: endSpan.end - startSpan.start,
        text: spanBucket.map((entry) => entry.text).join('')
      });
    }
  }

  for(const span of spans) {
    if(span.codepointLength == 1 && span.unmatched) {
      spanBucket.push(span);
    } else {
      mergeBucket(spanBucket);
      spanBucket = [];
      finalSpans.push(span);
    }
  }

  mergeBucket(spanBucket);

  // ... and done!
  return finalSpans;

  /*
  Important questions:
  - What is the cheapest way to have a word-break boundary after this character?
    - This is a 100% valid question; the complications arise in moving from an "earlier"
      answer to a "later" answer.
  - What words are possible to reach given recent possible boundaries?
    - idea: keep a running 'tally' of all Traversals that have valid paths at the
      current processing stage, noting what their starting points were.
      - matches the approach seen above.
      - Possible optimization: ... instead of 'tally'... 'priority queue'?
        - cheapest (start point cost) + (current traversal min cost) makes a good
          A* heuristic.
          - valid heuristic - traversal min-cost will never overestimate.
        - would likely avoid a need to search expensive branches this way.
        - current correction considers fat-finger prob * correction cost.
          - lexical probability only factors in 100%-after corrections, as a
            final step, at present, hence why it's not currently available.
    - if no longer valid, drop it from the 'tally' / the 'running'.
    - after each codepoint, we always add a newly-started traversal.
      - worst-case, it comes with a set penalty cost added to the previous
        codepoint's cost.
    - if a current traversal has entries, we have a direct candidate for best
      cost at this location.
      - if using the priority queue strat, we may need to process just enough entries
        to where the next node is equal or higher cost to the selected entry.
        - unless the cost crossed to reach it IS that cost.
    - If multiple co-located entries, use the best cost of them all. They all
      search-key to each other, anyway, so whatever is best is still "valid enough"
      to trigger a boundary.
  So...
  O(N), N = length of string: loop over each codepoint
    O(N [worst-case]): loop over each still-valid traversal
    - at most, matches the index of the current codepoint
    - in practice, will be NOTABLY fewer after the first few codepoints.
      O(1): check if the Traversal can continue with the new incoming char.
  */

  // 2. Initial state: one traversal @ root pointing to 'no ancestor'.
  // 2b. Could prep to build a 'memo' of calcs reusable by later runs?
  //     - may be best to note "still-valid start locations at this node"
  // 3. Run the boundary search - see the approx looping structure noted above.
  // 4. Best answer at the end wins!
  // 5. May be worth persisting a cache of recent memos, do a quick diff on the
  //    most recent as a step 0 in future runs, to reuse data?
  //    - but... how to clear that cache on model change?
  //      - duh! validate the passed-in root traversal. If unequal, is diff model. ezpz.

  // return [];
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import placeholder from "./placeholder.js";
import ascii from "./ascii.js";
import dict from "./dict/index.js";
import default_ from "./default/index.js";

export { placeholder, ascii, default_ as default, default_ as defaultWordbreaker };
export { placeholder, ascii, default_ as default, default_ as defaultWordbreaker, dict };
Loading

0 comments on commit 0d4be5f

Please sign in to comment.