Skip to content

Commit

Permalink
fix(web): improve tokenization output when wordbreaker breaks spec for span .left, .right values in output
Browse files Browse the repository at this point in the history
  • Loading branch information
jahorton committed Aug 20, 2024
1 parent 5bf42c5 commit 9227569
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 12 deletions.
28 changes: 16 additions & 12 deletions common/models/templates/src/tokenization.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,21 @@ export function tokenize(
let currentIndex = 0;
while(leftSpans.length > 0) {
const nextSpan = leftSpans[0];
if(nextSpan.start != currentIndex) {
if(Math.max(nextSpan.start, currentIndex) != currentIndex) {
const nextIndex = Math.max(currentIndex, nextSpan.start);
// Implicit whitespace span!
tokenization.left.push({
text: context.left!.substring(currentIndex, nextSpan.start),
text: context.left!.substring(currentIndex, nextIndex),
isWhitespace: true
});
currentIndex = nextSpan.start;
currentIndex = nextIndex;
} else {
leftSpans.shift();
// Explicit non-whitespace span.
tokenization.left.push({
text: nextSpan.text
});
currentIndex = nextSpan.end;
currentIndex = Math.max(currentIndex, nextSpan.end);
}
}

Expand All @@ -84,11 +85,12 @@ export function tokenize(
// Note: the default wordbreaker won't need this code, as it emits a `''`
// after final whitespace.
if(context.left != null && currentIndex != context.left.length) {
const nextIndex = Math.max(currentIndex, context.left!.length);
tokenization.left.push({
text: context.left.substring(currentIndex, context.left!.length),
text: context.left.substring(currentIndex, nextIndex),
isWhitespace: true
});
currentIndex = context.left!.length;
currentIndex = nextIndex;
}

// New step 2: handle any rejoins needed.
Expand Down Expand Up @@ -134,13 +136,14 @@ export function tokenize(
// `caretSplitsToken` check is additional.
while(rightSpans.length > 0) {
const nextSpan = rightSpans[0];
if(nextSpan.start != currentIndex) {
if(Math.max(nextSpan.start, currentIndex) != currentIndex) {
const nextIndex = Math.max(currentIndex, nextSpan.start);
// Implicit whitespace span!
tokenization.right.push({
text: context.right!.substring(currentIndex, nextSpan.start),
text: context.right!.substring(currentIndex, nextIndex),
isWhitespace: true
});
currentIndex = nextSpan.start;
currentIndex = nextIndex;
} else {
const leftTail = tokenization.left[leftTokenCount-1];
if(leftTail) {
Expand All @@ -159,7 +162,7 @@ export function tokenize(
tokenization.right.push({
text: nextSpan.text
});
currentIndex = nextSpan.end;
currentIndex = Math.max(currentIndex, nextSpan.end);
}

// We've always processed the "first right token" after the first iteration.
Expand All @@ -176,11 +179,12 @@ export function tokenize(
// Also note: is pretty much WET with the similar check after the
// leftSpan loop.
if(context.right && currentIndex != context.right.length) {
const nextIndex = Math.max(currentIndex, context.right!.length);
tokenization.right.push({
text: context.right.substring(currentIndex, context.right!.length),
text: context.right.substring(currentIndex, nextIndex),
isWhitespace: true
});
currentIndex = context.right!.length;
currentIndex = nextIndex;
}

return tokenization;
Expand Down
128 changes: 128 additions & 0 deletions common/models/templates/test/custom-breakers.def.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/**
 * The custom wordbreaker used by the sil.km.gcc model as of
 * https://github.com/keymanapp/lexical-models/pull/265.
 *
 * Splits `str` on whitespace (including U+200B zero-width space), then spins
 * selected punctuation marks off into their own tokens.  Unlike the "former"
 * variant, span positions are tracked with a monotonically-increasing search
 * offset, so repeated tokens each map to their own occurrence.
 *
 * @param {string} str - one half of the context to be broken into tokens.
 * @returns spans with `left`/`start`, `right`/`end`, `length`, and `text`.
 * @type {WordBreakingFunction}
 * */
export function customWordBreakerProper(str) {
  // \n and \r are already covered by \s; kept for emphasis/clarity.
  const whitespaceRegex = /\s|\u200b|\n|\r/;
  const tokens = str.split(whitespaceRegex);

  for(let i=0; i < tokens.length; i++) {
    const token = tokens[i];
    if(token.length == 0) {
      // split() emits '' between adjacent separators; drop such tokens.
      tokens.splice(i, 1);
      i--;
      continue;
    } else if(token.length == 1 && whitespaceRegex.test(token)) {
      tokens.splice(i, 1);
      i--;
      continue;
    }

    // Certain punctuation marks should be considered a separate token from the word they're next to.
    const punctuationMarks = ['«', '»', '$', '#' /* add extras here */];
    const punctSplitIndices = [];

    // Find if and where each mark exists within the token.
    // (Renamed the counter to `j`: the original `let i` shadowed the outer loop counter.)
    for(let j = 0; j < punctuationMarks.length; j++) {
      const split = token.indexOf(punctuationMarks[j]);
      if(split >= 0) {
        punctSplitIndices.push(split);
      }
    }

    // Pick the earliest mark's location. If none exists, use -1.
    // FIX: the previous `punctSplitIndices.sort()` used the default
    // lexicographic comparator ("11" < "2"), which could select a LATER mark
    // as the split point and leave earlier marks fused to their word.
    const splitPoint = punctSplitIndices.length > 0 ? Math.min(...punctSplitIndices) : -1;

    if(splitPoint > -1) {
      const left = token.substring(0, splitPoint); // (0, -1) => ''
      const punct = token.substring(splitPoint, splitPoint+1);
      const right = token.substring(splitPoint+1); // Starting past the end of the string => ''

      if(left) {
        tokens.splice(i++, 0, left);
      }
      tokens.splice(i++, 1, punct);
      if(right) {
        tokens.splice(i, 0, right);
      }
      // Ensure that the next iteration puts `i` immediately after the punctuation token... even if
      // there was a `right` portion, as it may have extra marks that also need to be spun off.
      i--;
    }
  }

  // Track a monotonically-increasing search offset so that a repeated token
  // (e.g. a second 'the') reports its own occurrence's span, not the first's.
  let latestIndex = 0;
  return tokens.map(function(token) {
    const start = str.indexOf(token, latestIndex);
    latestIndex = start + token.length;
    return {
      left: start,
      start: start,
      right: start + token.length,
      end: start + token.length,
      length: token.length,
      text: token
    }
  });
}

/**
 * The version of the custom wordbreaker used by the sil.km.gcc model
 * before https://github.com/keymanapp/lexical-models/pull/265, which
 * triggered #12200.
 *
 * NOTE(review): this is a regression-test fixture that deliberately preserves
 * the old, spec-breaking behavior; do NOT "fix" it.  Defects it reproduces:
 *   - span positions use `str.indexOf(token)` with no start offset, so a
 *     repeated token (e.g. the second 'the') is assigned the span of its
 *     FIRST occurrence - output spans are not monotonically increasing;
 *   - `punctSplitIndices.sort()` uses the default (lexicographic) comparator,
 *     so multi-digit indices can be mis-ordered;
 *   - empty-string tokens produced by `split` on adjacent separators are
 *     never removed.
 * @type {WordBreakingFunction}
 * */
export function customWordBreakerFormer (str) {
// Unlike the fixed version, '' entries from the split are kept in `tokens`.
const tokens = str.split(/\s|\u200b/);

for(let i=0; i < tokens.length; i++) {
const token = tokens[i];
// Single-character tokens are assumed not to need punctuation splitting.
if(token.length == 1) {
continue;
}

// Certain punctuation marks should be considered a separate token from the word they're next to.
const punctuationMarks = ['«', '»', '$', '#' /* add extras here */];
const punctSplitIndices = [];

// Find if and where each mark exists within the token
// (note: this inner `let i` shadows the outer loop counter).
for(let i = 0; i < punctuationMarks.length; i++) {
const split = token.indexOf(punctuationMarks[i]);
if(split >= 0) {
punctSplitIndices.push(split);
}
}

// Sort and pick the earliest mark's location. If none exists, use -1.
// (Default sort is lexicographic - part of the preserved legacy behavior.)
punctSplitIndices.sort();
const splitPoint = punctSplitIndices[0] === undefined ? -1 : punctSplitIndices[0];

if(splitPoint > -1) {
const left = token.substring(0, splitPoint); // (0, -1) => ''
const punct = token.substring(splitPoint, splitPoint+1);
const right = token.substring(splitPoint+1); // Starting past the end of the string => ''

if(left) {
tokens.splice(i++, 0, left);
}
tokens.splice(i++, 1, punct);
if(right) {
tokens.splice(i, 0, right);
}
// Ensure that the next iteration puts `i` immediately after the punctuation token... even if
// there was a `right` portion, as it may have extra marks that also need to be spun off.
i--;
}
}
// Spec violation preserved on purpose: indexOf always searches from index 0,
// so duplicate tokens all report the first occurrence's position.
return tokens.map(function(token) {
return {
left: str.indexOf(token),
start: str.indexOf(token),
right: str.indexOf(token) + token.length,
end: str.indexOf(token) + token.length,
text: token
}
});
}
75 changes: 75 additions & 0 deletions common/models/templates/test/test-tokenization.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import { assert } from 'chai';
import * as models from "@keymanapp/models-templates";
import * as wordBreakers from "@keymanapp/models-wordbreakers";
import { customWordBreakerFormer, customWordBreakerProper } from './custom-breakers.def.js';

function asProcessedToken(text) {
// default wordbreaker emits these at the end of each context half if ending with whitespace.
Expand Down Expand Up @@ -490,6 +491,80 @@ describe('Tokenization functions', function() {
caretSplitsToken: true
});
});

// Regression test for #12200: the legacy breaker computes every span via
// `str.indexOf(token)` from index 0, so repeated tokens share a span and the
// monotonically-increasing-index assumption is violated.  The tokenizer must
// not amplify that into giant/duplicated token text.
it('mitigates effects of previously-distributed malformed wordbreaker output', function () {
const text = 'the quick brown fox jumped over the lazy dog ';
/** @type { Context } */
const context = {
left: text,
right: '',
startOfBuffer: true,
endOfBuffer: true
}

const tokenized = models.tokenize(customWordBreakerFormer, context);

// Mitigation aims to prevent the _worst_ side-effects that can result from invalidating the
// underlying assumption of a monotonically-increasing index within the context -
// assigning repeated or blank entries the text that preceded them!
assert.notExists(tokenized.left.find((token) => token.text == text));
assert.notExists(tokenized.left.find((token) => token.text.startsWith(text.substring(0, 25))));

// 'the' appears twice in the context, which should result in two separate 'the' tokens here.
// This was improperly handled when we didn't check that assumption.
assert.equal(tokenized.left.filter((token) => token.text == 'the').length, 2);

// Does not address multiple blank-token ('') entries that result from intervening spaces;
// that would add too much extra complexity to the method... and it can already be
// handled decently by the predictive-text engine.
});

// Positive control for the mitigation test above: a breaker that emits
// monotonically-increasing spans should tokenize cleanly, with one whitespace
// token between words (and one after the trailing space).
it('properly works with well-formed custom wordbreaker output', function () {
const words = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog'];
// Joining with single spaces plus a trailing space reproduces the exact
// context string used by the malformed-output test.
const text = words.join(' ') + ' ';
/** @type { Context } */
const context = {
left: text,
right: '',
startOfBuffer: true,
endOfBuffer: true
}

const tokenized = models.tokenize(customWordBreakerProper, context);

// Easier-to-parse version: the non-whitespace tokens, in order.
assert.deepEqual(
tokenized.left
.filter((entry) => !entry.isWhitespace)
.map((entry) => entry.text),
words
);

// This time, with whitespaces: each word is followed by a single ' ' token,
// including 'dog' (because of the trailing space in the context).
assert.deepEqual(
tokenized.left.map((entry) => entry.text),
words.flatMap((word) => [word, ' '])
);
});

//
});

describe('getLastPreCaretToken', function() {
Expand Down

0 comments on commit 9227569

Please sign in to comment.