-
Notifications
You must be signed in to change notification settings - Fork 17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: Tokenization performance #2213
Changes from all commits
832400f
5db40a8
2aa177a
6a6c218
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,7 +63,7 @@ describe('readAppMapContent', () => { | |
expect(content).toContain('sql'); | ||
expect(content).toContain('database'); | ||
|
||
expect(content.split(' ')).toEqual([ | ||
expect(content.split(/\s+/)).toEqual([ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The content is no longer joined by spaces, so splitting on whitespace restores the previous behavior. This is functionally the same as how we're splitting words in tokenize, even though the regex is different. |
||
'Test', | ||
'AppMap', | ||
'test', | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,7 @@ export async function langchainSplitter(content: string, fileExtension: string): | |
splitter = new RecursiveCharacterTextSplitter(); | ||
} | ||
const documents = await splitter.createDocuments([content]); | ||
const contentLines = content.split('\n'); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ouch! :/ Good catch, thanks! |
||
|
||
// metadata includes: | ||
// { loc: { lines: { from: 1, to: 14 } } } | ||
|
@@ -58,7 +59,6 @@ export async function langchainSplitter(content: string, fileExtension: string): | |
content: '', | ||
}; | ||
if (lines) { | ||
const contentLines = content.split('\n'); | ||
result.content = contentLines.slice(lines.from - 1, lines.to).join('\n'); | ||
result.startLine = lines.from; | ||
result.endLine = lines.to; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,23 +59,55 @@ export function words(content: string): string[] { | |
return content.match(/\b\w+\b/g) ?? []; | ||
} | ||
|
||
/** | ||
* Prepares a string for tokenization by splitting it into batches of lines, each of which is | ||
* no longer than the specified maximum length. | ||
* | ||
* @param content The content to split into batches | ||
* @param batchSize The maximum number of characters per batch | ||
* @param maxLineLength The maximum length of a line | ||
* @returns an array of batches of content | ||
*/ | ||
export function batch(content: string, batchSize = 1000, maxLineLength = 1000): string[] { | ||
const lines = content.split('\n').filter(({ length }) => length <= maxLineLength); | ||
const result = []; | ||
for (let i = 0; i < lines.length; i += batchSize) { | ||
result.push(lines.slice(i, i + batchSize).join('\n')); | ||
} | ||
|
||
return result; | ||
} | ||
} | ||  There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have you considered implementing this as an async generator instead? IMO it'll make the flow clearer and you won't have to muck with the event loop directly. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd considered it, but I believe I'd still need to be awaiting There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you're generating batches asynchronously then it goes through the event loop, so you don't need async *batch(): AsyncGenerator<string> {
for (...) yield chunk;
}
for await (const chunk of batch())
doSomething(chunk); |
||
|
||
type FileTokens = { | ||
symbols: string[]; | ||
words: string[]; | ||
}; | ||
|
||
export function fileTokens( | ||
export async function fileTokens( | ||
content: string, | ||
fileExtension: string, | ||
enableGenericSymbolParsing = true | ||
): FileTokens { | ||
): Promise<FileTokens> { | ||
if (enableGenericSymbolParsing) | ||
debug('Using generic symbol parsing for file extension: %s', fileExtension); | ||
|
||
const symbolList = queryKeywords( | ||
symbols(content, fileExtension, enableGenericSymbolParsing) | ||
).sort(); | ||
const wordList = queryKeywords(words(content)).sort(); | ||
const batches = batch(content); | ||
const symbolList: string[] = []; | ||
const wordList: string[] = []; | ||
|
||
for (let i = 0; i < batches.length; ++i) { | ||
if (i && i % 5 === 0) { | ||
// Every 5th batch, wait for the next tick to avoid blocking the event loop | ||
await new Promise((resolve) => setImmediate(resolve)); | ||
} | ||
|
||
const batch = batches[i]; | ||
symbolList.push(...queryKeywords(symbols(batch, fileExtension, enableGenericSymbolParsing))); | ||
wordList.push(...queryKeywords(words(batch))); | ||
} | ||
|
||
symbolList.sort(); | ||
wordList.sort(); | ||
|
||
// Iterate through words, with a corresponding pointer to symbols. | ||
// If the word at the word index does not match the symbol at the symbol index, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My assessment here is that changing this whitespace is inconsequential, except in the scenario described in the comment.