Skip to content

Commit

Permalink
Merge branch 'epic/dict-breaker' into chore/merge-master-into-dict-br…
Browse files Browse the repository at this point in the history
…eaker
  • Loading branch information
mcdurdin authored Oct 10, 2024
2 parents 4a67c5c + 367d32e commit 39430bf
Show file tree
Hide file tree
Showing 8 changed files with 706 additions and 2 deletions.
2 changes: 2 additions & 0 deletions web/src/engine/predictive-text/wordbreakers/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ const breakWords = wordBreakers['default'];
console.log(breakWords('Hello, World!').map(span => span.text));
// prints: [ 'Hello', ',', 'World', '!' ]
```

## TODO: dict-breakers
1 change: 1 addition & 0 deletions web/src/engine/predictive-text/wordbreakers/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ THIS_SCRIPT="$(readlink -f "${BASH_SOURCE[0]}")"
# Note: the raw text files used for data.inc.ts are found within
# /resources/standards-data/unicode-character-database.
builder_describe "Builds the predictive-text wordbreaker implementation module" \
"@../templates test" \
"clean" \
"configure" \
"build" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ export default function default_(text: string, options?: DefaultWordBreakerOptio
/**
* A span that does not cut out the substring until it absolutely has to!
*/
class LazySpan implements Span {
export class LazySpan implements Span {
private _source: string;
readonly start: number;
readonly end: number;
Expand Down
329 changes: 329 additions & 0 deletions web/src/engine/predictive-text/wordbreakers/src/main/dict/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,329 @@
import { LazySpan } from "../default/index.js";
import { Span, LexiconTraversal } from "@keymanapp/common-types";

// Based on the MIN_KEYSTROKE_PROBABILITY penalty used by the lm-worker.
const CHAR_SKIP_PENALTY = -Math.log2(.0001);

// const DEFAULT_PARAMS = {

// }

export function splitOnWhitespace(text: string): Span[] {
const sections: Span[] = [];

let start = 0;

// Surrogate pairs will never overlap \u0020, so we don't need to be
// surrogate-pair aware here.
text += ' ';
for(let index = 0; index < text.length; index++) {
const char = text.charAt(index);
if(char.match(/\s|\u200c/)) {
if(start !== undefined) {
sections.push(new LazySpan(text, start, index));
start = undefined; // we do not emit whitespace tokens here.
}
} else if(start === undefined) {
start = index;
}
}

return sections;
}

export type DictBreakerPath = {
/**
* The index of the character immediately before the most recently-available word boundary.
* Is set to -1 if no such boundary exists.
*/
boundaryIndex: number;

// Could add a 'reference' if we create objects for each char in the context - such as for
// caching & reusing boundary info with future inputs.

/**
* An active traversal representing potential words that may become completed, starting
* immediately after the boundary indicated by `boundaryIndex`.
*/
traversal: LexiconTraversal;

/**
* cost: measured in -log(p) of each decision.
*/
cost: number;

/**
* Indicates if this path's most recent traversal enforces a boundary without matching
* a word in the lexicon.
*/
wasUnmatchedChar?: boolean;

/**
* The path object used to reach the previous boundary.
*/
parent?: DictBreakerPath;
}

/**
* Provides dictionary-based wordbreaking assuming a LexiconTraversal can be specified for
* the dictionary.
* @param fullText The full context to be tokenized.
* @param dictRoot A LexiconTraversal interface from the active LexicalModel,
* allowing efficient dictionary lookups of encountered words.
* @returns
*/
export default function dict(fullText: string, dictRoot: LexiconTraversal): Span[] {
if(!dictRoot) {
throw new Error("Cannot use dictionary-based wordbreaker without `LexiconTraversal` dictionary access");
}

// Whenever we have a space or a ZWNJ (U+200C), we'll assume a 100%-confirmed wordbreak
// at that location. We only need to "guess" at anything between 'em.
const sections = splitOnWhitespace(fullText);
let allSpans: Span[] = [];

for(const section of sections) {
// Technically, this may give us a 'partial' wordbreak at the section's end, which
// may be slightly significant for earlier sections. Probably not worth worrying
// about, though.
allSpans = allSpans.concat(_dict_break(section, dictRoot));
}

return allSpans;
}

// Exposed for testing reasons.
/**
* Given a section of text without whitespaces and ZWNJs, uses the active lexical-model's
* entries to detect optimal word-breaking locations.
* @param span A span representing the section and its position within the context.
* @param dictRoot A LexiconTraversal interface from the active LexicalModel,
* allowing efficient dictionary lookups of encountered words.
* @returns An array of `Span`s representing each tokenized word, indexed according to their
* location in the section's original context.
*/
export function _dict_break(span: Span, dictRoot: LexiconTraversal): Span[] {
if(span.length == 0) {
return [];
}

const text = span.text;
const splitIndex = span.start;

// 1. Splay the string into individual codepoints.
const codepointArr = [...text];

// 2. Initialize tracking vars and prep the loop.
let bestBoundingPath: DictBreakerPath = {
boundaryIndex: -1,
traversal: dictRoot,
cost: 0
};

// Optimization TODO: convert to priority queue?
let activePaths: DictBreakerPath[] = [bestBoundingPath];

// 3. Run the master loop.
// 3a. For each codepoint in the string...
for(let i=0; i < codepointArr.length; i++) {
const codepoint = codepointArr[i];
let paths: DictBreakerPath[] = [];

// 3b. compute all viable paths to continue words & start new ones.
for(const path of activePaths) {
let traversal = path.traversal.child(codepoint);
if(!traversal) {
continue;
}

const pathCtd: DictBreakerPath = {
boundaryIndex: path.boundaryIndex,
traversal: traversal,
cost: (path.parent?.cost ?? 0) - Math.log2(traversal.p),
parent: path.parent
}

paths.push(pathCtd);
}

// 3c. Find the minimal-cost new path with a word boundary, if any exist.
// If the traversal has entries, it's a legal path-end; else it isn't.
const boundingPaths = paths.filter((path) => !!path.traversal.entries.length);
// If none exist, this is the fallback.
const penaltyParent: DictBreakerPath = {
boundaryIndex: i-1, // successor will cover one codepoint
traversal: dictRoot.child(codepoint), // no `entries`, but... it's fine.
// bestBoundingPath is currently a root-level traversal. Its parent corresponds
// to the previous token.
cost: bestBoundingPath.cost + CHAR_SKIP_PENALTY,
parent: bestBoundingPath.parent,
wasUnmatchedChar: true
};

boundingPaths.push(penaltyParent);
// Sort in cost-ascending order.
// As we're using negative log likelihood, smaller is better.
// (The closer to log_2(1) = 0, the better.)
boundingPaths.sort((a, b) => a.cost - b.cost);

// We build a new path starting from this specific path; we're modeling a word-end.
// If it's the "penalty path", we already built it.
const bestBound = boundingPaths[0];
const successorPath: DictBreakerPath = {
boundaryIndex: i,
traversal: dictRoot,
cost: bestBound.cost,
parent: bestBound
}

bestBoundingPath = successorPath;
paths.push(successorPath);

// 3d. We now shift to the next loop iteration; we use the descendant `paths` set.
activePaths = paths;
}

// 4. When all iterations are done, determine the lowest-cost path that
// remains, without regard to if it supports a word-boundary.
//
// If we happen to end on a potential word-boundary, opt for that one. If two
// match aside from boundaryIndex, take the lesser. It comes first, BTW, so
// stable-sorts auto-resolve this.
activePaths.sort((a, b) => (a.cost - b.cost));
const winningPath = activePaths[0];

// 5. Build the spans.
const spans: (Span & { codepointLength: number, unmatched: boolean })[] = [];
const pathAsArray: DictBreakerPath[] = [];

let rewindPath = winningPath;
while(rewindPath) {
const start = rewindPath.boundaryIndex+1;
const end = codepointArr.length; // consistent because of the effects from the splice below
const text = codepointArr.splice(start, end - start).join('');

pathAsArray.unshift(rewindPath);
spans.unshift({
start: start, // currently in code points; we'll correct it on the next pass.
end: end, // same.
length: text.length, // Span spec: in code units
text: text,
codepointLength: end - start,
unmatched: !!rewindPath.wasUnmatchedChar
});
rewindPath = rewindPath.parent;
}

// 6. Span pass 2 - index finalization.
// - Remember, split-index is our offset!
// - We currently have codepoint `start` and `end`, but need code-unit values.
let totalLength = splitIndex;
for(let i = 0; i < spans.length; i++) {
const baseSpan = spans[i];
const start = totalLength;
totalLength += baseSpan.length;

const trueSpan: typeof spans[0] = {
...baseSpan,
start: start,
end: totalLength
};

spans[i] = trueSpan;
}

// If all we had was whitespace, hence no spans, return.
if(spans.length == 0) {
return spans;
}

// 7. Span pass 3: identify continuous penalty spans. Why split into separate spans
// when we can merge all the bits we can't recognize as a big lump instead?
// - Looks nicer in the unit tests, if nothing else.
// - Has _far_ better potential for 'learning' down the line.

let spanBucket: Span[] = [];
const finalSpans: Span[] = [];

function mergeBucket(spanBucket: Span[]) {
if(spanBucket.length > 0) {
const startSpan = spanBucket[0];
const endSpan = spanBucket[spanBucket.length - 1];
//

finalSpans.push({
start: startSpan.start,
end: endSpan.end,
length: endSpan.end - startSpan.start,
text: spanBucket.map((entry) => entry.text).join('')
});
}
}

for(const span of spans) {
if(span.codepointLength == 1 && span.unmatched) {
spanBucket.push(span);
} else {
mergeBucket(spanBucket);
spanBucket = [];
finalSpans.push(span);
}
}

mergeBucket(spanBucket);

// ... and done!
return finalSpans;

/*
Important questions:
- What is the cheapest way to have a word-break boundary after this character?
- This is a 100% valid question; the complications arise in moving from an "earlier"
answer to a "later" answer.
- What words are possible to reach given recent possible boundaries?
- idea: keep a running 'tally' of all Traversals that have valid paths at the
current processing stage, noting what their starting points were.
- matches the approach seen above.
- Possible optimization: ... instead of 'tally'... 'priority queue'?
- cheapest (start point cost) + (current traversal min cost) makes a good
A* heuristic.
- valid heuristic - traversal min-cost will never overestimate.
- would likely avoid a need to search expensive branches this way.
- current correction considers fat-finger prob * correction cost.
- lexical probability only factors in 100%-after corrections, as a
final step, at present, hence why it's not currently available.
- if no longer valid, drop it from the 'tally' / the 'running'.
- after each codepoint, we always add a newly-started traversal.
- worst-case, it comes with a set penalty cost added to the previous
codepoint's cost.
- if a current traversal has entries, we have a direct candidate for best
cost at this location.
- if using the priority queue strat, we may need to process just enough entries
to where the next node is equal or higher cost to the selected entry.
- unless the cost crossed to reach it IS that cost.
- If multiple co-located entries, use the best cost of them all. They all
search-key to each other, anyway, so whatever is best is still "valid enough"
to trigger a boundary.
So...
O(N), N = length of string: loop over each codepoint
O(N [worst-case]): loop over each still-valid traversal
- at most, matches the index of the current codepoint
- in practice, will be NOTABLY fewer after the first few codepoints.
O(1): check if the Traversal can continue with the new incoming char.
*/

// 2. Initial state: one traversal @ root pointing to 'no ancestor'.
// 2b. Could prep to build a 'memo' of calcs reusable by later runs?
// - may be best to note "still-valid start locations at this node"
// 3. Run the boundary search - see the approx looping structure noted above.
// 4. Best answer at the end wins!
// 5. May be worth persisting a cache of recent memos, do a quick diff on the
// most recent as a step 0 in future runs, to reuse data?
// - but... how to clear that cache on model change?
// - duh! validate the passed-in root traversal. If unequal, is diff model. ezpz.

// return [];
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import placeholder from "./placeholder.js";
import ascii from "./ascii.js";
import dict from "./dict/index.js";
import default_ from "./default/index.js";

export { placeholder, ascii, default_ as default, default_ as defaultWordbreaker };
export { placeholder, ascii, default_ as default, default_ as defaultWordbreaker, dict };
Loading

0 comments on commit 39430bf

Please sign in to comment.