-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(tokenize): add basic tokenizer implementations (#109)
- Loading branch information
Showing
13 changed files
with
974 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
"@livekit/agents": minor | ||
--- | ||
|
||
add basic tokenizer implementations |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
// SPDX-FileCopyrightText: 2024 LiveKit, Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
import * as tokenizer from '../index.js'; | ||
import { BufferedSentenceStream } from '../token_stream.js'; | ||
import { hyphenator } from './hyphenator.js'; | ||
import { splitParagraphs } from './paragraph.js'; | ||
import { splitSentences } from './sentence.js'; | ||
import { splitWords } from './word'; | ||
|
||
/** Settings captured by the basic `SentenceTokenizer` when it is constructed. */
interface TokenizerOptions {
  /** Language tag supplied to the constructor; stored alongside the other settings. */
  language: string;
  /** Minimum sentence length handed to `splitSentences`. */
  minSentenceLength: number;
  /** Context length forwarded as the last argument of `BufferedSentenceStream`. */
  streamContextLength: number;
}
|
||
/**
 * Rule-based sentence tokenizer backed by `splitSentences`.
 *
 * The configured language is stored but is not consulted by the methods below.
 */
export class SentenceTokenizer extends tokenizer.SentenceTokenizer {
  #config: TokenizerOptions;

  /**
   * @param language - language tag kept in the configuration
   * @param minSentenceLength - minimum sentence length passed to `splitSentences`
   * @param streamContextLength - context length passed to the buffered stream
   */
  constructor(language = 'en-US', minSentenceLength = 20, streamContextLength = 10) {
    super();
    this.#config = { language, minSentenceLength, streamContextLength };
  }

  /**
   * Split `text` into sentences and return only the sentence strings.
   * The `language` argument is accepted for interface compatibility and ignored.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  tokenize(text: string, language?: string): string[] {
    const { minSentenceLength } = this.#config;
    const sentences: string[] = [];
    for (const [sentence] of splitSentences(text, minSentenceLength)) {
      sentences.push(sentence);
    }
    return sentences;
  }

  /**
   * Create a buffered stream that segments incoming text with `splitSentences`.
   * The `language` argument is accepted for interface compatibility and ignored.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  stream(language?: string): tokenizer.SentenceStream {
    const { minSentenceLength, streamContextLength } = this.#config;
    const segment = (text: string) => splitSentences(text, minSentenceLength);
    return new BufferedSentenceStream(segment, minSentenceLength, streamContextLength);
  }
}
|
||
/**
 * Whitespace-based word tokenizer backed by `splitWords`.
 *
 * NOTE(review): this class extends `tokenizer.SentenceTokenizer` and streams through
 * `BufferedSentenceStream`; a word-specific base class/stream may have been intended.
 * Confirm against `tokenizer.js` before changing it.
 */
export class WordTokenizer extends tokenizer.SentenceTokenizer {
  #ignorePunctuation: boolean;

  /**
   * @param ignorePunctuation - when true, punctuation is stripped from each word
   */
  constructor(ignorePunctuation = true) {
    super();
    this.#ignorePunctuation = ignorePunctuation;
  }

  /**
   * Split `text` into words and return only the word strings.
   * The `language` argument is accepted for interface compatibility and ignored.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  tokenize(text: string, language?: string): string[] {
    const words: string[] = [];
    for (const [word] of splitWords(text, this.#ignorePunctuation)) {
      words.push(word);
    }
    return words;
  }

  /**
   * Create a buffered stream that segments incoming text with `splitWords`,
   * passing 1 for both numeric arguments of `BufferedSentenceStream`.
   * The `language` argument is accepted for interface compatibility and ignored.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  stream(language?: string): tokenizer.SentenceStream {
    const segment = (text: string) => splitWords(text, this.#ignorePunctuation);
    return new BufferedSentenceStream(segment, 1, 1);
  }
}
|
||
/** Hyphenate a single word by delegating to the shared `hyphenator` instance. */
export const hyphenateWord = (word: string): string[] => hyphenator.hyphenateWord(word);
|
||
/** Split `text` into paragraphs and return only the paragraph strings (offsets are dropped). */
export const tokenizeParagraphs = (text: string): string[] =>
  splitParagraphs(text).map(([paragraph]) => paragraph);
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
// SPDX-FileCopyrightText: 2024 LiveKit, Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
export { SentenceTokenizer, WordTokenizer, tokenizeParagraphs, hyphenateWord } from './basic.js'; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
// SPDX-FileCopyrightText: 2024 LiveKit, Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
/**
 * Split the text into paragraphs.
 *
 * Paragraphs are separated by a blank line, i.e. two newlines with optional
 * whitespace between them. Each paragraph is trimmed, and whitespace-only
 * paragraphs are omitted.
 *
 * @param text - the text to split
 * @returns one `[paragraph, start, end]` tuple per paragraph, where `start`
 * (inclusive) and `end` (exclusive) are offsets of the trimmed paragraph in `text`
 */
export const splitParagraphs = (text: string): [string, number, number][] => {
  const separator = /\n\s*\n/g;
  const paragraphs: [string, number, number][] = [];

  // Trim the slice [from, to) and record it together with its offsets in `text`.
  const pushParagraph = (from: number, to: number): void => {
    const raw = text.slice(from, to);
    const paragraph = raw.trim();
    if (!paragraph) return;

    // The trimmed text begins right after the leading whitespace of the slice.
    const paragraphStart = from + (raw.length - raw.trimStart().length);
    paragraphs.push([paragraph, paragraphStart, paragraphStart + paragraph.length]);
  };

  let cursor = 0;
  let match: RegExpExecArray | null;
  // A separator always contains at least two newlines, so exec() always advances.
  while ((match = separator.exec(text)) !== null) {
    pushParagraph(cursor, match.index);
    cursor = match.index + match[0].length;
  }

  // Whatever follows the last separator (or the whole text when there is none).
  pushParagraph(cursor, text.length);

  return paragraphs;
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
// SPDX-FileCopyrightText: 2024 LiveKit, Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
/**
 * Split the text into sentences.
 *
 * The text is normalized first (newlines become spaces), then periods that do not
 * end a sentence (titles, websites, decimals, ellipses, initials, acronyms and
 * company suffixes) are temporarily replaced with a `<prd>` marker. Every remaining
 * `.`, `?` and `!` is followed by a `<stop>` marker, the text is cut at those
 * markers, and consecutive pieces are merged until they exceed `minLength`.
 *
 * @param text - the text to split
 * @param minLength - pieces are merged until the merged sentence is longer than
 * `minLength - 1` characters
 * @returns one `[sentence, start, end]` tuple per sentence; the offsets are
 * accumulated over the normalized text, not the original input
 */
export const splitSentences = (text: string, minLength = 20): [string, number, number][] => {
  const alphabets = /([A-Za-z])/g;
  const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;
  const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;
  const starters =
    /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)/g;
  const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;
  const websites = /[.](com|net|org|io|gov|edu|me)/g;
  const digits = /([0-9])/g;
  const dots = /\.{2,}/g;

  // Concatenate pattern fragments into one global RegExp. A RegExp interpolated
  // into a string becomes "/…/g", so `.source` must be used to compose patterns.
  const compose = (...parts: (RegExp | string)[]): RegExp =>
    new RegExp(parts.map((part) => (typeof part === 'string' ? part : part.source)).join(''), 'g');

  text = text.replace(/\n/g, ' ');
  text = text.replace(prefixes, '$1<prd>');
  // `websites` has a single capture group (the TLD), so it is `$1`.
  text = text.replace(websites, '<prd>$1');
  text = text.replace(compose(digits, '[.]', digits), '$1<prd>$2');
  text = text.replace(dots, (match) => '<prd>'.repeat(match.length));
  text = text.replace(/Ph\.D\./g, 'Ph<prd>D<prd>');
  // Stand-alone initials ("J. Smith"). The trailing space keeps this rule from
  // consuming the first letter of an acronym before the acronym rules below run.
  text = text.replace(compose('\\s', alphabets, '[.] '), ' $1<prd> ');
  // An acronym followed by a typical sentence starter ends the sentence.
  text = text.replace(compose(acronyms, ' ', starters), '$1<stop> $2');
  text = text.replace(
    compose(alphabets, '[.]', alphabets, '[.]', alphabets, '[.]'),
    '$1<prd>$2<prd>$3<prd>',
  );
  text = text.replace(compose(alphabets, '[.]', alphabets, '[.]'), '$1<prd>$2<prd>');
  // Company suffix followed by a sentence starter: keep the period and end the sentence.
  text = text.replace(compose(' ', suffixes, '[.] ', starters), ' $1<prd><stop> $2');
  text = text.replace(compose(' ', suffixes, '[.]'), ' $1<prd>');
  text = text.replace(compose(' ', alphabets, '[.]'), ' $1<prd>');
  // Move terminators outside closing quotes so the quote stays with its sentence.
  text = text.replace(/\.”/g, '”.');
  text = text.replace(/\."/g, '".');
  text = text.replace(/!"/g, '"!');
  text = text.replace(/\?"/g, '"?');
  text = text.replace(/\./g, '.<stop>');
  text = text.replace(/\?/g, '?<stop>');
  text = text.replace(/!/g, '!<stop>');
  text = text.replace(/<prd>/g, '.');

  const pieces = text.split('<stop>');
  text = text.replace(/<stop>/g, '');

  const sentences: [string, number, number][] = [];
  let buf = '';
  let start = 0;
  let end = 0;
  for (const piece of pieces) {
    const sentence = piece.trim();
    if (!sentence) continue;

    // `buf` always carries one leading space, which is stripped when emitting.
    buf += ' ' + sentence;
    end += piece.length;
    if (buf.length > minLength) {
      sentences.push([buf.slice(1), start, end]);
      start = end;
      buf = '';
    }
  }

  // Emit whatever is left even if it is shorter than the minimum length.
  if (buf) {
    sentences.push([buf.slice(1), start, text.length - 1]);
  }

  return sentences;
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// SPDX-FileCopyrightText: 2024 LiveKit, Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
import { PUNCTUATIONS } from '../tokenizer.js'; | ||
|
||
/**
 * Split the text into words.
 *
 * A word is a maximal run of non-whitespace characters.
 *
 * @param text - the text to split
 * @param ignorePunctuation - when true, every character listed in `PUNCTUATIONS`
 * is removed from each word (a word made only of punctuation becomes `''`)
 * @returns one `[word, start, end]` tuple per word, where `start` (inclusive) and
 * `end` (exclusive) are offsets of the original, unstripped word in `text`
 */
export const splitWords = (text: string, ignorePunctuation = true): [string, number, number][] => {
  const wordPattern = /\S+/g;
  const words: [string, number, number][] = [];

  // Built once rather than per word. Characters that are special inside a
  // character class (\ ] [ ^ -) are escaped so each entry matches literally.
  const punctuation = ignorePunctuation
    ? new RegExp(
        `[${PUNCTUATIONS.map((p: string) => p.replace(/[\\\][^-]/g, '\\$&')).join('')}]`,
        'g',
      )
    : undefined;

  let match: RegExpExecArray | null;
  while ((match = wordPattern.exec(text)) !== null) {
    const raw = match[0];
    const start = match.index;
    const end = start + raw.length;

    // Offsets always describe the raw word, even when punctuation is stripped.
    const word = punctuation ? raw.replace(punctuation, '') : raw;
    words.push([word, start, end]);
  }

  return words;
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
// SPDX-FileCopyrightText: 2024 LiveKit, Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
import * as basic from './basic/index.js'; | ||
|
||
export { | ||
type TokenData, | ||
SentenceTokenizer, | ||
SentenceStream, | ||
WordTokenizer, | ||
WordStream, | ||
} from './tokenizer.js'; | ||
|
||
export { BufferedSentenceStream, BufferedTokenStream } from './token_stream.js'; | ||
|
||
export { basic }; |
Oops, something went wrong.