feat(tokenize): add basic tokenizer implementations (#109)

livekit · Nov 4, 2024 · d32f247 · d32f247
1 parent 0e40262
commit d32f247
Show file tree

Hide file tree

Showing 13 changed files with 976 additions and 27 deletions.
diff --git a/.changeset/funny-adults-brake.md b/.changeset/funny-adults-brake.md
@@ -0,0 +1,5 @@
+---
+"@livekit/agents": minor
+---
+
+add basic tokenizer implementations
diff --git a/agents/src/index.ts b/agents/src/index.ts
@@ -13,6 +13,7 @@ import * as cli from './cli.js';
 import * as llm from './llm/index.js';
 import * as multimodal from './multimodal/index.js';
 import * as stt from './stt/index.js';
+import * as tokenize from './tokenize/index.js';
 import * as tts from './tts/index.js';
 
 export * from './vad.js';
@@ -23,8 +24,7 @@ export * from './worker.js';
 export * from './utils.js';
 export * from './log.js';
 export * from './generator.js';
-export * from './tokenize.js';
 export * from './audio.js';
 export * from './transcription.js';
 
-export { cli, stt, tts, llm, multimodal };
+export { cli, stt, tts, llm, multimodal, tokenize };
diff --git a/agents/src/tokenize.ts b/agents/src/tokenize.ts
diff --git a/agents/src/tokenize/basic/basic.ts b/agents/src/tokenize/basic/basic.ts
@@ -0,0 +1,73 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import * as tokenizer from '../index.js';
+import { BufferedSentenceStream } from '../token_stream.js';
+import { hyphenator } from './hyphenator.js';
+import { splitParagraphs } from './paragraph.js';
+import { splitSentences } from './sentence.js';
+import { splitWords } from './word';
+
+/** Settings captured by the basic `SentenceTokenizer` when it is constructed. */
+interface TokenizerOptions {
+  /** Language tag given to the constructor; it is stored but not read by the code in this file. */
+  language: string;
+  /** Forwarded to `splitSentences` as its `minLength` and to the buffered stream as its second argument. */
+  minSentenceLength: number;
+  /** Forwarded to the buffered stream as its third argument. */
+  streamContextLength: number;
+}
+
+/**
+ * Sentence tokenizer backed by the rule-based `splitSentences` helper.
+ *
+ * The `language` arguments accepted by the constructor, `tokenize` and `stream` are kept for
+ * interface compatibility; none of them is consulted when splitting.
+ */
+export class SentenceTokenizer extends tokenizer.SentenceTokenizer {
+  #options: TokenizerOptions;
+
+  constructor(language = 'en-US', minSentenceLength = 20, streamContextLength = 10) {
+    super();
+    const options: TokenizerOptions = { language, minSentenceLength, streamContextLength };
+    this.#options = options;
+  }
+
+  /**
+   * Split `text` into sentences and return only the sentence strings (offsets are dropped).
+   */
+  // eslint-disable-next-line @typescript-eslint/no-unused-vars
+  tokenize(text: string, language?: string): string[] {
+    const { minSentenceLength } = this.#options;
+    const sentences: string[] = [];
+    for (const [sentence] of splitSentences(text, minSentenceLength)) {
+      sentences.push(sentence);
+    }
+    return sentences;
+  }
+
+  /**
+   * Create a buffered stream that splits pushed text with the same sentence rules.
+   */
+  // eslint-disable-next-line @typescript-eslint/no-unused-vars
+  stream(language?: string): tokenizer.SentenceStream {
+    const { minSentenceLength, streamContextLength } = this.#options;
+    const split = (text: string) => splitSentences(text, minSentenceLength);
+    return new BufferedSentenceStream(split, minSentenceLength, streamContextLength);
+  }
+}
+
+/**
+ * Word tokenizer backed by the whitespace-based `splitWords` helper.
+ *
+ * NOTE(review): this class extends `tokenizer.SentenceTokenizer` and hands out a
+ * `SentenceStream`; confirm whether the word-level base class and stream type were intended.
+ */
+export class WordTokenizer extends tokenizer.SentenceTokenizer {
+  #ignorePunctuation: boolean;
+
+  constructor(ignorePunctuation = true) {
+    super();
+    this.#ignorePunctuation = ignorePunctuation;
+  }
+
+  /**
+   * Split `text` into words and return only the word strings (offsets are dropped).
+   */
+  // eslint-disable-next-line @typescript-eslint/no-unused-vars
+  tokenize(text: string, language?: string): string[] {
+    const words: string[] = [];
+    for (const [word] of splitWords(text, this.#ignorePunctuation)) {
+      words.push(word);
+    }
+    return words;
+  }
+
+  /**
+   * Create a buffered stream that splits pushed text into words; both numeric stream
+   * arguments are fixed at 1.
+   */
+  // eslint-disable-next-line @typescript-eslint/no-unused-vars
+  stream(language?: string): tokenizer.SentenceStream {
+    const ignorePunctuation = this.#ignorePunctuation;
+    const split = (text: string) => splitWords(text, ignorePunctuation);
+    return new BufferedSentenceStream(split, 1, 1);
+  }
+}
+
+/** Break `word` into its hyphenation parts by delegating to the shared `hyphenator`. */
+export const hyphenateWord = (word: string): string[] => hyphenator.hyphenateWord(word);
+
+/** Split `text` into paragraphs and return only the paragraph strings (offsets are dropped). */
+export const tokenizeParagraphs = (text: string): string[] => {
+  const paragraphs: string[] = [];
+  for (const [paragraph] of splitParagraphs(text)) {
+    paragraphs.push(paragraph);
+  }
+  return paragraphs;
+};
diff --git a/agents/src/tokenize/basic/hyphenator.ts b/agents/src/tokenize/basic/hyphenator.ts
diff --git a/agents/src/tokenize/basic/index.ts b/agents/src/tokenize/basic/index.ts
@@ -0,0 +1,5 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+export { SentenceTokenizer, WordTokenizer, tokenizeParagraphs, hyphenateWord } from './basic.js';
diff --git a/agents/src/tokenize/basic/paragraph.ts b/agents/src/tokenize/basic/paragraph.ts
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Split the text into paragraphs.
+ *
+ * Paragraphs are separated by a blank line, i.e. two newlines with nothing but whitespace
+ * between them. Each paragraph is trimmed and blank paragraphs are skipped.
+ *
+ * @param text - text to split
+ * @returns `[paragraph, start, end]` tuples, where `start`/`end` are the offsets of the
+ *   trimmed paragraph inside `text` (`end` is exclusive)
+ */
+export const splitParagraphs = (text: string): [string, number, number][] => {
+  const result: [string, number, number][] = [];
+
+  // Trim the raw segment text[from, to) and record it with absolute offsets unless it is blank.
+  const collect = (from: number, to: number): void => {
+    const raw = text.slice(from, to);
+    const trimmed = raw.trim();
+    if (trimmed.length === 0) return;
+
+    const begin = from + raw.indexOf(trimmed);
+    result.push([trimmed, begin, begin + trimmed.length]);
+  };
+
+  let cursor = 0;
+  for (const separator of text.matchAll(/\n\s*\n/g)) {
+    const boundary = separator.index!;
+    collect(cursor, boundary);
+    // continue right after the blank-line separator
+    cursor = boundary + separator[0].length;
+  }
+
+  // whatever follows the last separator (or the whole text when there is none)
+  collect(cursor, text.length);
+
+  return result;
+};
diff --git a/agents/src/tokenize/basic/sentence.ts b/agents/src/tokenize/basic/sentence.ts
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Split the text into sentences.
+ *
+ * Periods that do not end a sentence (titles such as `Dr.`, website suffixes, decimals,
+ * ellipses, `Ph.D.`, single-letter initials, dotted acronyms and company suffixes) are first
+ * shielded behind a `<prd>` placeholder. Every remaining `.`, `?` and `!` is then followed by
+ * a `<stop>` marker and the text is cut at those markers. Consecutive pieces are merged until
+ * the buffered sentence is at least `minLength` characters long.
+ *
+ * @param text - text to split; newlines are treated as spaces
+ * @param minLength - number of characters a sentence must reach before it is emitted
+ * @returns `[sentence, start, end]` tuples; the offsets are accumulated over the processed
+ *   text (after newline and quote normalisation), and a trailing sentence shorter than
+ *   `minLength` ends at `text.length - 1`
+ */
+export const splitSentences = (text: string, minLength = 20): [string, number, number][] => {
+  // Fragments that get spliced into larger patterns are kept as pattern *sources*.
+  // Interpolating a RegExp object into a template literal would embed its `/…/g`
+  // delimiters, producing a pattern that never matches ordinary text.
+  const alphabets = /([A-Za-z])/.source;
+  const suffixes = /(Inc|Ltd|Jr|Sr|Co)/.source;
+  const starters =
+    /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)/
+      .source;
+  const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/.source;
+  const digits = /([0-9])/.source;
+
+  const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;
+  const websites = /[.](com|net|org|io|gov|edu|me)/g;
+  const dots = /\.{2,}/g;
+
+  text = text.replace(/\n/g, ' ');
+  text = text.replace(prefixes, '$1<prd>');
+  // the pattern has a single capture group, so the suffix is `$1`
+  text = text.replace(websites, '<prd>$1');
+  text = text.replace(new RegExp(`${digits}[.]${digits}`, 'g'), '$1<prd>$2');
+  text = text.replace(dots, (match) => '<prd>'.repeat(match.length));
+  text = text.replace(/Ph\.D\./g, 'Ph<prd>D<prd>');
+  // `\\s` keeps the whitespace class; a bare `\s` in a template literal is just the letter "s"
+  text = text.replace(new RegExp(`\\s${alphabets}[.] `, 'g'), ' $1<prd> ');
+  text = text.replace(new RegExp(`${acronyms} ${starters}`, 'g'), '$1<stop> $2');
+  text = text.replace(
+    new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, 'g'),
+    '$1<prd>$2<prd>$3<prd>',
+  );
+  text = text.replace(new RegExp(`${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>');
+  // The patterns below start with a literal space, so the replacement restores it; the
+  // suffix's own period is kept (as `<prd>`) in front of the sentence break.
+  text = text.replace(new RegExp(` ${suffixes}[.] ${starters}`, 'g'), ' $1<prd><stop> $2');
+  text = text.replace(new RegExp(` ${suffixes}[.]`, 'g'), ' $1<prd>');
+  text = text.replace(new RegExp(` ${alphabets}[.]`, 'g'), ' $1<prd>');
+  // move terminators outside closing quotes so the quote stays with its sentence
+  text = text.replace(/\.”/g, '”.');
+  text = text.replace(/\."/g, '".');
+  text = text.replace(/!"/g, '"!');
+  text = text.replace(/\?"/g, '"?');
+  text = text.replace(/\./g, '.<stop>');
+  text = text.replace(/\?/g, '?<stop>');
+  text = text.replace(/!/g, '!<stop>');
+  text = text.replace(/<prd>/g, '.');
+
+  const split = text.split('<stop>');
+  text = text.replace(/<stop>/g, '');
+
+  const sentences: [string, number, number][] = [];
+  let buf = '';
+  let start = 0;
+  let end = 0;
+  for (const match of split) {
+    const sentence = match.trim();
+    if (!sentence) continue;
+
+    // `buf` always carries one leading space, hence `>` rather than `>=`
+    buf += ' ' + sentence;
+    end += match.length;
+    if (buf.length > minLength) {
+      sentences.push([buf.slice(1), start, end]);
+      start = end;
+      buf = '';
+    }
+  }
+
+  // flush a trailing sentence that never reached `minLength`
+  if (buf) {
+    sentences.push([buf.slice(1), start, text.length - 1]);
+  }
+
+  return sentences;
+};
diff --git a/agents/src/tokenize/basic/word.ts b/agents/src/tokenize/basic/word.ts
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { PUNCTUATIONS } from '../tokenizer.js';
+
+/**
+ * Split the text into words.
+ *
+ * A word is a maximal run of non-whitespace characters.
+ *
+ * @param text - text to split
+ * @param ignorePunctuation - when true, every character listed in `PUNCTUATIONS` is removed
+ *   from the returned word
+ * @returns `[word, start, end]` tuples; `start`/`end` are the offsets of the original,
+ *   unstripped run inside `text` (`end` is exclusive)
+ */
+export const splitWords = (text: string, ignorePunctuation = true): [string, number, number][] => {
+  const words: [string, number, number][] = [];
+
+  // Built once per call rather than once per word. Each entry is escaped so that characters
+  // with a meaning inside a character class (`\`, `]`, `^`, `-`) are matched literally
+  // instead of closing the class, negating it or forming a range.
+  const punctuation = ignorePunctuation
+    ? new RegExp(`[${PUNCTUATIONS.map((p) => p.replace(/[\\\]^-]/g, '\\$&')).join('')}]`, 'g')
+    : undefined;
+
+  for (const match of text.matchAll(/\S+/g)) {
+    const raw = match[0];
+    const start = match.index!;
+    const end = start + raw.length;
+    const word = punctuation ? raw.replace(punctuation, '') : raw;
+
+    words.push([word, start, end]);
+  }
+
+  return words;
+};
diff --git a/agents/src/tokenize/index.ts b/agents/src/tokenize/index.ts
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import * as basic from './basic/index.js';
+
+export {
+  type TokenData,
+  SentenceTokenizer,
+  SentenceStream,
+  WordTokenizer,
+  WordStream,
+} from './tokenizer.js';
+
+export { BufferedSentenceStream, BufferedTokenStream } from './token_stream.js';
+
+export { basic };