From 54bdf73c93b958df609c1103b07efc98aa5a8374 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Fri, 2 Aug 2024 12:42:54 +0700 Subject: [PATCH 1/4] refactor(common/models): move primary trie-compilation code into common/models/templates We'll likely want to dynamically build a Trie to represent user-specific entries made available by the active OS. We'll then blend _that_ with the 'static' distributed model. --- common/models/templates/src/index.ts | 4 + common/models/templates/src/trie-builder.ts | 155 +++++++++ common/models/templates/src/trie.ts | 28 +- developer/src/kmc-model/src/build-trie.ts | 353 ++++---------------- 4 files changed, 250 insertions(+), 290 deletions(-) create mode 100644 common/models/templates/src/trie-builder.ts diff --git a/common/models/templates/src/index.ts b/common/models/templates/src/index.ts index 8563004fefa..b32c37d45a8 100644 --- a/common/models/templates/src/index.ts +++ b/common/models/templates/src/index.ts @@ -4,4 +4,8 @@ export { } from "./common.js"; export { default as QuoteBehavior } from "./quote-behavior.js"; export { Tokenization, tokenize, getLastPreCaretToken, wordbreak } from "./tokenization.js"; +export { + Entry, InternalNode, Leaf, Node +} from './trie.js'; +export { addUnsorted, TrieBuilder } from './trie-builder.js'; export { default as TrieModel, TrieModelOptions } from "./trie-model.js"; \ No newline at end of file diff --git a/common/models/templates/src/trie-builder.ts b/common/models/templates/src/trie-builder.ts new file mode 100644 index 00000000000..8b90b5f28d7 --- /dev/null +++ b/common/models/templates/src/trie-builder.ts @@ -0,0 +1,155 @@ +import { SENTINEL_CODE_UNIT, Wordform2Key } from "./common.js"; +import { Entry, InternalNode, Leaf, Node, Trie } from "./trie.js"; + +function createRootNode(): Node { + return { + type: 'leaf', + weight: 0, + entries: [] + }; +} + +/** + * Adds an entry to the trie. + * + * Note that the trie will likely be unsorted after the add occurs. 
Before + * performing a lookup on the trie, call sortTrie() on the root node! + * + * @param node Which node should the entry be added to? + * @param entry the wordform/weight/key to add to the trie + * @param index the index in the key and also the trie depth. Should be set to + * zero when adding onto the root node of the trie. + */ +export function addUnsorted(node: Node, entry: Entry, index: number = 0) { + // Each node stores the MAXIMUM weight out of all of its descendants, to + // enable a greedy search through the trie. + node.weight = Math.max(node.weight, entry.weight); + + // When should a leaf become an interior node? + // When it already has a value, but the key of the current value is longer + // than the prefix. + if (node.type === 'leaf' && index < entry.key.length && node.entries.length >= 1) { + convertLeafToInternalNode(node, index); + } + + if (node.type === 'leaf') { + // The key matches this leaf node, so add yet another entry. + addItemToLeaf(node, entry); + } else { + // Push the node down to a lower node. + addItemToInternalNode(node, entry, index); + } + + node.unsorted = true; +} + +/** + * Adds an item to the internal node at a given depth. + * @param item + * @param index + */ +function addItemToInternalNode(node: InternalNode, item: Entry, index: number) { + let char = item.key[index]; + // If an internal node is the proper site for item, it belongs under the + // corresponding (sentinel, internal-use) child node signifying this. + if(char == undefined) { + char = SENTINEL_CODE_UNIT; + } + if (!node.children[char]) { + node.children[char] = createRootNode(); + node.values.push(char); + } + addUnsorted(node.children[char], item, index + 1); +} + +function addItemToLeaf(leaf: Leaf, item: Entry) { + leaf.entries.push(item); +} + +/** + * Mutates the given Leaf to turn it into an InternalNode. + * + * NOTE: the node passed in will be DESTRUCTIVELY CHANGED into a different + * type when passed into this function! 
+ * + * @param depth depth of the trie at this level. + */ +function convertLeafToInternalNode(leaf: Leaf, depth: number): void { + let entries = leaf.entries; + + // Alias the current node, as the desired type. + let internal = ( leaf) as InternalNode; + internal.type = 'internal'; + + delete (leaf as Partial).entries; + internal.values = []; + internal.children = {}; + + // Convert the old values array into the format for interior nodes. + for (let item of entries) { + let char: string; + if (depth < item.key.length) { + char = item.key[depth]; + } else { + char = SENTINEL_CODE_UNIT; + } + + if (!internal.children[char]) { + internal.children[char] = createRootNode(); + internal.values.push(char); + } + addUnsorted(internal.children[char], item, depth + 1); + } + + internal.unsorted = true; +} + +/** + * Recursively sort the trie, in descending order of weight. + * @param node any node in the trie + */ +function sortTrie(node: Node) { + if (node.type === 'leaf') { + if (!node.unsorted) { + return; + } + + node.entries.sort(function (a, b) { return b.weight - a.weight; }); + } else { + // We MUST recurse and sort children before returning. + for (let char of node.values) { + sortTrie(node.children[char]); + } + + if (!node.unsorted) { + return; + } + + node.values.sort((a, b) => { + return node.children[b].weight - node.children[a].weight; + }); + } + + delete node.unsorted; +} + +/** + * Wrapper class for the trie and its nodes. + */ +export class TrieBuilder extends Trie { + /** The total weight of the entire trie. 
*/ + totalWeight: number; + + constructor(toKey: Wordform2Key) { + super(createRootNode(), 0, toKey); + this.totalWeight = 0; + } + + sort() { + sortTrie(this.root); + } + + getRoot(): Node { + return this.root; + } +} \ No newline at end of file diff --git a/common/models/templates/src/trie.ts b/common/models/templates/src/trie.ts index fc3ab7d5bc5..47dc6fe5dc6 100644 --- a/common/models/templates/src/trie.ts +++ b/common/models/templates/src/trie.ts @@ -23,12 +23,22 @@ export interface InternalNode { * in sorted order in the .values array. */ children: { [codeunit: string]: Node }; + + /** + * Used during compilation. + */ + unsorted?: boolean; } /** Only leaf nodes actually contain entries (i.e., the words proper). */ export interface Leaf { type: 'leaf'; weight: number; entries: Entry[]; + + /** + * Used during compilation. + */ + unsorted?: boolean; } /** @@ -88,12 +98,26 @@ export class TrieTraversal implements LexiconTraversal { return traversal; } + private sortNodeIfNeeded(node: Node) { + if(node.unsorted) { + if(node.type == 'leaf') { + node.entries.sort((a, b) => b.weight - a.weight) + } else { + node.values.sort((a, b) => node.children[b].weight - node.children[a].weight); + } + + node.unsorted = false; + } + } + // Handles one code unit at a time. private _child(char: USVString): TrieTraversal | undefined { const root = this.root; const totalWeight = this.totalWeight; const nextPrefix = this.prefix + char; + this.sortNodeIfNeeded(root); + if(root.type == 'internal') { let childNode = root.children[char]; if(!childNode) { @@ -119,6 +143,8 @@ export class TrieTraversal implements LexiconTraversal { let root = this.root; const totalWeight = this.totalWeight; + this.sortNodeIfNeeded(root); + if(root.type == 'internal') { for(let entry of root.values) { let entryNode = root.children[entry]; @@ -223,7 +249,7 @@ export class TrieTraversal implements LexiconTraversal { * Wrapper class for the trie and its nodes. 
*/ export class Trie { - private root: Node; + protected root: Node; /** The total weight of the entire trie. */ readonly totalWeight: number; /** diff --git a/developer/src/kmc-model/src/build-trie.ts b/developer/src/kmc-model/src/build-trie.ts index 76da23662fe..efcda3d38cf 100644 --- a/developer/src/kmc-model/src/build-trie.ts +++ b/developer/src/kmc-model/src/build-trie.ts @@ -1,6 +1,8 @@ import { ModelCompilerError, ModelCompilerMessageContext, ModelCompilerMessages } from "./model-compiler-messages.js"; import { callbacks } from "./compiler-callbacks.js"; +import { addUnsorted, Node, TrieBuilder } from '@keymanapp/models-templates'; + // Supports LF or CRLF line terminators. const NEWLINE_SEPARATOR = /\u000d?\u000a/; @@ -29,7 +31,7 @@ export function createTrieDataStructure(filenames: string[], searchTermToKey?: ( let wordlist: WordList = {}; filenames.forEach(filename => parseWordListFromFilename(wordlist, filename)); - let trie = Trie.buildTrie(wordlist, searchTermToKey as Trie.SearchTermToKey); + let trie = buildTrie(wordlist, searchTermToKey as SearchTermToKey); return JSON.stringify(trie); } @@ -185,304 +187,77 @@ function* enumerateLines(lines: string[]): Generator { } } -namespace Trie { - /** - * An **opaque** type for a string that is exclusively used as a search key in - * the trie. There should be a function that converts arbitrary strings - * (queries) and converts them into a standard search key for a given language - * model. - * - * Fun fact: This opaque type has ALREADY saved my bacon and found a bug! - */ - type SearchKey = string & { _: 'SearchKey'}; - - /** - * A function that converts a string (word form or query) into a search key - * (secretly, this is also a string). - */ - export interface SearchTermToKey { - (wordform: string): SearchKey; - } - - // The following trie implementation has been (heavily) derived from trie-ing - // by Conrad Irwin. 
- // - // trie-ing is distributed under the terms of the MIT license, reproduced here: - // - // The MIT License - // Copyright (c) 2015-2017 Conrad Irwin - // Copyright (c) 2011 Marc Campbell - // - // Permission is hereby granted, free of charge, to any person obtaining a copy - // of this software and associated documentation files (the "Software"), to deal - // in the Software without restriction, including without limitation the rights - // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - // copies of the Software, and to permit persons to whom the Software is - // furnished to do so, subject to the following conditions: - // - // The above copyright notice and this permission notice shall be included in - // all copies or substantial portions of the Software. - // - // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - // THE SOFTWARE. - // - // See: https://github.com/ConradIrwin/trie-ing/blob/df55d7af7068d357829db9e0a7faa8a38add1d1d/LICENSE - - /** - * An entry in the prefix trie. The matched word is "content". - */ - interface Entry { - content: string; - key: SearchKey; - weight: number; - } - - /** - * The trie is made up of nodes. A node can be EITHER an internal node (whose - * only children are other nodes) OR a leaf, which actually contains the word - * form entries. - */ - type Node = InternalNode | Leaf; - - /** - * An internal node. - */ - interface InternalNode { - type: 'internal'; - weight: number; - // TODO: As an optimization, "values" can be a single string! 
- values: string[]; - children: { [codeunit: string]: Node }; - unsorted?: true; - } - - /** - * A leaf node. - */ - interface Leaf { - type: 'leaf'; - weight: number; - entries: Entry[]; - unsorted?: true; - } - - /** - * A sentinel value for when an internal node has contents and requires an - * "internal" leaf. That is, this internal node has content. Instead of placing - * entries as children in an internal node, a "fake" leaf is created, and its - * key is this special internal value. - * - * The value is a valid Unicode BMP code point, but it is a "non-character". - * Unicode will never assign semantics to these characters, as they are - * intended to be used internally as sentinel values. - */ - const INTERNAL_VALUE = '\uFDD0'; - - /** - * Builds a trie from a word list. - * - * @param wordlist The wordlist with non-negative weights. - * @param keyFunction Function that converts word forms into indexed search keys - * @returns A JSON-serialiable object that can be given to the TrieModel constructor. - */ - export function buildTrie(wordlist: WordList, keyFunction: SearchTermToKey): object { - let trie = new Trie(keyFunction); - buildFromWordList(trie, wordlist); - const root = trie.root; - return { - totalWeight: sumWeights(root), - root: root - } - } - - /** - * Populates the trie with the contents of an entire wordlist. - * @param words a list of word and count pairs. 
- */ - function buildFromWordList(trie: Trie, words: WordList): Trie { - for (let [wordform, weight] of Object.entries(words)) { - let key = trie.toKey(wordform); - addUnsorted(trie.root, { key, weight, content: wordform }, 0); - } - sortTrie(trie.root); - return trie; - } - - /** - * Wrapper class for the trie and its nodes and wordform to search - */ - class Trie { - readonly root = createRootNode(); - toKey: SearchTermToKey; - constructor(wordform2key: SearchTermToKey) { - this.toKey = wordform2key; - } - } - - // "Constructors" - function createRootNode(): Node { - return { - type: 'leaf', - weight: 0, - entries: [] - }; - } - - // Implement Trie creation. - - /** - * Adds an entry to the trie. - * - * Note that the trie will likely be unsorted after the add occurs. Before - * performing a lookup on the trie, use call sortTrie() on the root note! - * - * @param node Which node should the entry be added to? - * @param entry the wordform/weight/key to add to the trie - * @param index the index in the key and also the trie depth. Should be set to - * zero when adding onto the root node of the trie. - */ - function addUnsorted(node: Node, entry: Entry, index: number = 0) { - // Each node stores the MAXIMUM weight out of all of its decesdents, to - // enable a greedy search through the trie. - node.weight = Math.max(node.weight, entry.weight); - - // When should a leaf become an interior node? - // When it already has a value, but the key of the current value is longer - // than the prefix. - if (node.type === 'leaf' && index < entry.key.length && node.entries.length >= 1) { - convertLeafToInternalNode(node, index); - } - - if (node.type === 'leaf') { - // The key matches this leaf node, so add yet another entry. - addItemToLeaf(node, entry); - } else { - // Push the node down to a lower node. - addItemToInternalNode(node, entry, index); - } +/** + * An **opaque** type for a string that is exclusively used as a search key in + * the trie. 
There should be a function that takes arbitrary strings + * (queries) and converts them into a standard search key for a given language + * model. + * + * Fun fact: This opaque type has ALREADY saved my bacon and found a bug! + */ +type SearchKey = string & { _: 'SearchKey'}; - node.unsorted = true; - } +/** + * A function that converts a string (word form or query) into a search key + * (secretly, this is also a string). + */ +export interface SearchTermToKey { + (wordform: string): SearchKey; +} - /** - * Adds an item to the internal node at a given depth. - * @param item - * @param index - */ - function addItemToInternalNode(node: InternalNode, item: Entry, index: number) { - let char = item.key[index]; - // If an internal node is the proper site for item, it belongs under the - // corresponding (sentinel, internal-use) child node signifying this. - if(char == undefined) { - char = INTERNAL_VALUE; - } - if (!node.children[char]) { - node.children[char] = createRootNode(); - node.values.push(char); - } - addUnsorted(node.children[char], item, index + 1); - } +/** + * Builds a trie from a word list. + * + * @param wordlist The wordlist with non-negative weights. + * @param keyFunction Function that converts word forms into indexed search keys + * @returns A JSON-serializable object that can be given to the TrieModel constructor. + */ +export function buildTrie(wordlist: WordList, keyFunction: SearchTermToKey): object { + let collater = new TrieBuilder(keyFunction); - function addItemToLeaf(leaf: Leaf, item: Entry) { - leaf.entries.push(item); + buildFromWordList(collater, wordlist); + return { + totalWeight: sumWeights(collater.getRoot()), + root: collater.getRoot() } +} - /** - * Mutates the given Leaf to turn it into an InternalNode. - * - * NOTE: the node passed in will be DESTRUCTIVELY CHANGED into a different - * type when passed into this function! 
- */ - function convertLeafToInternalNode(leaf: Leaf, depth: number): void { - let entries = leaf.entries; - - // Alias the current node, as the desired type. - let internal = ( leaf) as InternalNode; - internal.type = 'internal'; - - delete leaf.entries; - internal.values = []; - internal.children = {}; - - // Convert the old values array into the format for interior nodes. - for (let item of entries) { - let char: string; - if (depth < item.key.length) { - char = item.key[depth]; - } else { - char = INTERNAL_VALUE; - } - - if (!internal.children[char]) { - internal.children[char] = createRootNode(); - internal.values.push(char); - } - addUnsorted(internal.children[char], item, depth + 1); - } - - internal.unsorted = true; +/** + * Populates the trie with the contents of an entire wordlist. + * @param words a list of word and count pairs. + */ +function buildFromWordList(trieCollator: TrieBuilder, words: WordList): TrieBuilder { + for (let [wordform, weight] of Object.entries(words)) { + let key = trieCollator.toKey(wordform); + addUnsorted(trieCollator.getRoot(), { content: wordform, key, weight }); } + trieCollator.sort(); + return trieCollator; +} - /** - * Recursively sort the trie, in descending order of weight. - * @param node any node in the trie - */ - function sortTrie(node: Node) { - if (node.type === 'leaf') { - if (!node.unsorted) { - return; - } - - node.entries.sort(function (a, b) { return b.weight - a.weight; }); - } else { - // We MUST recurse and sort children before returning. - for (let char of node.values) { - sortTrie(node.children[char]); - } - - if (!node.unsorted) { - return; - } - - node.values.sort((a, b) => { - return node.children[b].weight - node.children[a].weight; - }); - } - - delete node.unsorted; +/** + * O(n) recursive traversal to sum the total weight of all leaves in the + * trie, starting at the provided node. + * + * @param node The node to start summing weights. 
+ */ +function sumWeights(node: Node): number { + let val: number; + if (node.type === 'leaf') { + val = node.entries + .map(entry => entry.weight) + //.map(entry => isNaN(entry.weight) ? 1 : entry.weight) + .reduce((acc, count) => acc + count, 0); + } else { + val = Object.keys(node.children) + .map((key) => sumWeights(node.children[key])) + .reduce((acc, count) => acc + count, 0); } - /** - * O(n) recursive traversal to sum the total weight of all leaves in the - * trie, starting at the provided node. - * - * @param node The node to start summing weights. - */ - function sumWeights(node: Node): number { - let val: number; - if (node.type === 'leaf') { - val = node.entries - .map(entry => entry.weight) - //.map(entry => isNaN(entry.weight) ? 1 : entry.weight) - .reduce((acc, count) => acc + count, 0); - } else { - val = Object.keys(node.children) - .map((key) => sumWeights(node.children[key])) - .reduce((acc, count) => acc + count, 0); - } - - if(isNaN(val)) { - throw new Error("Unexpected NaN has appeared!"); - } - return val; + if(isNaN(val)) { + throw new Error("Unexpected NaN has appeared!"); } + return val; } /** From 32a31bc693003b84d553e035b88cc5da5c028f91 Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Fri, 2 Aug 2024 13:36:07 +0700 Subject: [PATCH 2/4] fix(developer): add 'build' dependency link to models/templates --- developer/src/kmc-model/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/developer/src/kmc-model/build.sh b/developer/src/kmc-model/build.sh index c549648b451..d0cf48817bb 100755 --- a/developer/src/kmc-model/build.sh +++ b/developer/src/kmc-model/build.sh @@ -13,7 +13,7 @@ THIS_SCRIPT="$(readlink -f "${BASH_SOURCE[0]}")" builder_describe "Keyman kmc Lexical Model Compiler module" \ "@/common/web/keyman-version" \ "@/developer/src/common/web/test-helpers" \ - "@/common/models/templates test" \ + "@/common/models/templates" \ "clean" \ "configure" \ "build" \ From f6e87f8c74d48c9ea4a4c49676dadc75a71682e7 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Fri, 9 Aug 2024 13:18:41 +0700 Subject: [PATCH 3/4] fix(developer): sets models-templates as package.json dependency --- developer/src/kmc-model/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/developer/src/kmc-model/package.json b/developer/src/kmc-model/package.json index da73aabe7b1..eb60642fb1c 100644 --- a/developer/src/kmc-model/package.json +++ b/developer/src/kmc-model/package.json @@ -32,12 +32,12 @@ "dependencies": { "@keymanapp/common-types": "*", "@keymanapp/keyman-version": "*", + "@keymanapp/models-templates": "*", "@keymanapp/models-types": "*", "typescript": "^5.4.5" }, "devDependencies": { "@keymanapp/developer-test-helpers": "*", - "@keymanapp/models-templates": "*", "@types/mocha": "^5.2.7", "@types/node": "^20.4.1", "c8": "^7.12.0", From 894721dafca48a9ec76288f6b13e369c0bdcbdd1 Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Fri, 9 Aug 2024 14:01:31 +0700 Subject: [PATCH 4/4] chore(common): update package-lock.json --- package-lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package-lock.json b/package-lock.json index 098c99700f0..fa7423246eb 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1419,12 +1419,12 @@ "dependencies": { "@keymanapp/common-types": "*", "@keymanapp/keyman-version": "*", + "@keymanapp/models-templates": "*", "@keymanapp/models-types": "*", "typescript": "^5.4.5" }, "devDependencies": { "@keymanapp/developer-test-helpers": "*", - "@keymanapp/models-templates": "*", "@types/mocha": "^5.2.7", "@types/node": "^20.4.1", "c8": "^7.12.0",