feat(tokenize): add basic tokenizer implementations (#109)
nbsp committed Nov 4, 2024
1 parent 0e40262 commit d32f247
Showing 13 changed files with 976 additions and 27 deletions.
5 changes: 5 additions & 0 deletions .changeset/funny-adults-brake.md
@@ -0,0 +1,5 @@
---
"@livekit/agents": minor
---

add basic tokenizer implementations
4 changes: 2 additions & 2 deletions agents/src/index.ts
@@ -13,6 +13,7 @@ import * as cli from './cli.js';
 import * as llm from './llm/index.js';
 import * as multimodal from './multimodal/index.js';
 import * as stt from './stt/index.js';
+import * as tokenize from './tokenize/index.js';
 import * as tts from './tts/index.js';

@@ -23,8 +24,7 @@ export * from './worker.js';
 export * from './utils.js';
 export * from './log.js';
 export * from './generator.js';
-export * from './tokenize.js';
 export * from './audio.js';
 export * from './transcription.js';

-export { cli, stt, tts, llm, multimodal };
+export { cli, stt, tts, llm, multimodal, tokenize };
22 changes: 0 additions & 22 deletions agents/src/tokenize.ts

This file was deleted.

73 changes: 73 additions & 0 deletions agents/src/tokenize/basic/basic.ts
@@ -0,0 +1,73 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import * as tokenizer from '../index.js';
import { BufferedSentenceStream } from '../token_stream.js';
import { hyphenator } from './hyphenator.js';
import { splitParagraphs } from './paragraph.js';
import { splitSentences } from './sentence.js';
import { splitWords } from './word.js';

interface TokenizerOptions {
language: string;
minSentenceLength: number;
streamContextLength: number;
}

export class SentenceTokenizer extends tokenizer.SentenceTokenizer {
#config: TokenizerOptions;

constructor(language = 'en-US', minSentenceLength = 20, streamContextLength = 10) {
super();
this.#config = {
language,
minSentenceLength,
streamContextLength,
};
}

// eslint-disable-next-line @typescript-eslint/no-unused-vars
tokenize(text: string, language?: string): string[] {
return splitSentences(text, this.#config.minSentenceLength).map((tok) => tok[0]);
}

// eslint-disable-next-line @typescript-eslint/no-unused-vars
stream(language?: string): tokenizer.SentenceStream {
return new BufferedSentenceStream(
(text: string) => splitSentences(text, this.#config.minSentenceLength),
this.#config.minSentenceLength,
this.#config.streamContextLength,
);
}
}

export class WordTokenizer extends tokenizer.WordTokenizer {
#ignorePunctuation: boolean;

constructor(ignorePunctuation = true) {
super();
this.#ignorePunctuation = ignorePunctuation;
}

// eslint-disable-next-line @typescript-eslint/no-unused-vars
tokenize(text: string, language?: string): string[] {
return splitWords(text, this.#ignorePunctuation).map((tok) => tok[0]);
}

// eslint-disable-next-line @typescript-eslint/no-unused-vars
stream(language?: string): tokenizer.WordStream {
return new BufferedSentenceStream(
(text: string) => splitWords(text, this.#ignorePunctuation),
1,
1,
);
}
}

export const hyphenateWord = (word: string): string[] => {
return hyphenator.hyphenateWord(word);
};

export const tokenizeParagraphs = (text: string): string[] => {
return splitParagraphs(text).map((tok) => tok[0]);
};
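
A quick usage sketch of the API added in this file (the import path and sample strings are illustrative, and the commented results are approximate rather than test fixtures):

import { SentenceTokenizer, WordTokenizer, hyphenateWord, tokenizeParagraphs } from './basic.js';

// sentence-level tokenization, buffered to the default 20-character minimum
new SentenceTokenizer().tokenize('Dr. Smith went to Washington. It rained all week.');
// roughly ['Dr. Smith went to Washington.', 'It rained all week.']

// word-level tokenization, stripping punctuation by default
new WordTokenizer().tokenize('Hello, world!');
// ['Hello', 'world']

hyphenateWord('tokenizer'); // syllable chunks per the bundled hyphenation patterns
tokenizeParagraphs('First paragraph.\n\nSecond paragraph.'); // ['First paragraph.', 'Second paragraph.']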
436 changes: 436 additions & 0 deletions agents/src/tokenize/basic/hyphenator.ts

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions agents/src/tokenize/basic/index.ts
@@ -0,0 +1,5 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0

export { SentenceTokenizer, WordTokenizer, tokenizeParagraphs, hyphenateWord } from './basic.js';
43 changes: 43 additions & 0 deletions agents/src/tokenize/basic/paragraph.ts
@@ -0,0 +1,43 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0

/**
* Split the text into paragraphs.
*/
export const splitParagraphs = (text: string): [string, number, number][] => {
const re = /\n\s*\n/g;
const splits = Array.from(text.matchAll(re));

const paragraphs: [string, number, number][] = [];
let start = 0;

// no splits (single paragraph)
if (splits.length === 0) {
const stripped = text.trim();
if (!stripped) return paragraphs;

const start = text.indexOf(stripped);
return [[stripped, start, start + stripped.length]];
}

for (const split of splits) {
const end = split.index!;
const paragraph = text.slice(start, end).trim();
if (paragraph) {
const paragraphStart = start + text.slice(start, end).indexOf(paragraph);
const paragraphEnd = paragraphStart + paragraph.length;
paragraphs.push([paragraph, paragraphStart, paragraphEnd]);
}
start = end + split[0].length;
}

const lastParagraph = text.slice(start).trim();
if (lastParagraph) {
const paragraphStart = start + text.slice(start).indexOf(lastParagraph);
const paragraphEnd = paragraphStart + lastParagraph.length;
paragraphs.push([lastParagraph, paragraphStart, paragraphEnd]);
}

return paragraphs;
};
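
Each returned tuple is [paragraph, startIndex, endIndex] into the original string, so callers can map paragraphs back to source positions. A worked example:

splitParagraphs('One.\n\nTwo.');
// [['One.', 0, 4], ['Two.', 6, 10]]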
69 changes: 69 additions & 0 deletions agents/src/tokenize/basic/sentence.ts
@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0

/**
* Split the text into sentences.
*/
export const splitSentences = (text: string, minLength = 20): [string, number, number][] => {
const alphabets = /([A-Za-z])/g;
const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;
const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;
const starters =
/(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)/g;
const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;
const websites = /[.](com|net|org|io|gov|edu|me)/g;
const digits = /([0-9])/g;
const dots = /\.{2,}/g;

text = text.replaceAll('\n', ' ');
text = text.replaceAll(prefixes, '$1<prd>');
text = text.replaceAll(websites, '<prd>$1');
// compose combined patterns from the RegExp sources; interpolating a RegExp
// object directly stringifies it with slashes and flags and corrupts the pattern
text = text.replaceAll(new RegExp(`${digits.source}[.]${digits.source}`, 'g'), '$1<prd>$2');
text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));
text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');
text = text.replaceAll(new RegExp(`\\s${alphabets.source}[.]`, 'g'), ' $1<prd>');
text = text.replaceAll(new RegExp(`${acronyms.source} ${starters.source}`, 'g'), '$1<stop> $2');
text = text.replaceAll(
new RegExp(`${alphabets.source}[.]${alphabets.source}[.]${alphabets.source}[.]`, 'g'),
'$1<prd>$2<prd>$3<prd>',
);
text = text.replaceAll(new RegExp(`${alphabets.source}[.]${alphabets.source}[.]`, 'g'), '$1<prd>$2<prd>');
text = text.replaceAll(new RegExp(` ${suffixes.source}[.] ${starters.source}`, 'g'), ' $1<stop> $2');
text = text.replaceAll(new RegExp(` ${suffixes.source}[.]`, 'g'), ' $1<prd>');
text = text.replaceAll(new RegExp(` ${alphabets.source}[.]`, 'g'), ' $1<prd>');
text = text.replaceAll('.”', '”.');
text = text.replaceAll('."', '".');
text = text.replaceAll('!"', '"!');
text = text.replaceAll('?"', '"?');
text = text.replaceAll('.', '.<stop>');
text = text.replaceAll('?', '?<stop>');
text = text.replaceAll('!', '!<stop>');
text = text.replaceAll('<prd>', '.');

const split = text.split('<stop>');
text = text.replaceAll('<stop>', '');

const sentences: [string, number, number][] = [];
let buf = '';
let start = 0;
let end = 0;
for (const match of split) {
const sentence = match.trim();
if (!sentence) continue;

buf += ' ' + sentence;
end += match.length;
if (buf.length > minLength) {
sentences.push([buf.slice(1), start, end]);
start = end;
buf = '';
}
}

if (buf) {
sentences.push([buf.slice(1), start, text.length - 1]);
}

return sentences;
};
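
This is the classic rule-based splitter: periods that do not terminate a sentence (honorifics, acronyms, decimals, TLDs) are temporarily shielded as <prd>, genuine terminators become <stop>, and the markers are resolved afterwards. A hedged sketch of the observable behavior (sample text illustrative):

splitSentences('Dr. Smith works at Acme Inc. He holds a Ph.D. in physics.');
// roughly two sentences: the split lands after 'Inc.' but not after 'Dr.' or inside 'Ph.D.'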
27 changes: 27 additions & 0 deletions agents/src/tokenize/basic/word.ts
@@ -0,0 +1,27 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { PUNCTUATIONS } from '../tokenizer.js';

/**
* Split the text into words.
*/
export const splitWords = (text: string, ignorePunctuation = true): [string, number, number][] => {
const re = /\S+/g;
const words: [string, number, number][] = [];

let arr;
while ((arr = re.exec(text)) !== null) {
let word = arr[0];
const start = arr.index;
const end = start + word.length;

if (ignorePunctuation) {
// strip punctuation by filtering characters; building a character class from
// PUNCTUATIONS would require escaping any regex-special entries it contains
word = [...word].filter((char) => !PUNCTUATIONS.includes(char)).join('');
}

words.push([word, start, end]);
}

return words;
};
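
Note that the start/end offsets cover the raw whitespace-delimited token, including any punctuation later stripped from the returned word. For example:

splitWords('Hello, world!');
// [['Hello', 0, 6], ['world', 7, 13]] (the offsets span 'Hello,' and 'world!')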
16 changes: 16 additions & 0 deletions agents/src/tokenize/index.ts
@@ -0,0 +1,16 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import * as basic from './basic/index.js';

export {
type TokenData,
SentenceTokenizer,
SentenceStream,
WordTokenizer,
WordStream,
} from './tokenizer.js';

export { BufferedSentenceStream, BufferedTokenStream } from './token_stream.js';

export { basic };
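
With the namespace export added to agents/src/index.ts above, consumers reach all of this through the tokenize namespace. A minimal sketch, assuming the package is installed under its published name @livekit/agents:

import { tokenize } from '@livekit/agents';

const tokenizer = new tokenize.basic.SentenceTokenizer();
const sentences = tokenizer.tokenize('One sentence here. And another one follows it.');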
(Remaining file diffs were not loaded on this page.)
