fix: Tokenization performance #2213

Merged
merged 4 commits into from
Jan 22, 2025
Merged
4 changes: 3 additions & 1 deletion packages/cli/src/rpc/explain/index/appmap-index.ts
@@ -126,7 +126,9 @@ export async function readAppMapContent(appmapFile: string): Promise<string> {
appmapWords.push(...parameters);
appmapWords.push(...types);

return appmapWords.join(' ');
// Words are separated by new lines to reduce the time spent tokenizing the content.
// Long lines may be slower to tokenize or be skipped altogether.
return appmapWords.join('\n');
Contributor Author, commenting on lines +129 to +131:

My assessment here is that changing this whitespace is inconsequential, except in the scenario described in the comment.
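
For illustration, a rough sketch of that scenario, assuming the default batchSize and maxLineLength of the batch() helper added to tokenize.ts later in this PR (the numbers are made up):

const tokens = Array(50_000).fill('word');

// Joined with spaces, the content is one very long line, which batch() would
// filter out entirely because it exceeds maxLineLength.
batch(tokens.join(' ')).length;  // 0

// Joined with newlines, every token is a short line; nothing is skipped and
// the content is split into batches of 1000 lines each.
batch(tokens.join('\n')).length; // 50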

}

export function trueFilter(): Promise<boolean> {
@@ -63,7 +63,7 @@ describe('readAppMapContent', () => {
expect(content).toContain('sql');
expect(content).toContain('database');

expect(content.split(' ')).toEqual([
expect(content.split(/\s+/)).toEqual([
Contributor Author:

The content is no longer joined by spaces, so splitting on whitespace restores the previous behavior. This is functionally the same as how we're splitting words in tokenize, even though the regex is different.
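
As a quick sketch of that equivalence (not part of the PR; the sample words are arbitrary):

const spaceJoined = ['Test', 'AppMap', 'sql'].join(' ');
const newlineJoined = ['Test', 'AppMap', 'sql'].join('\n');

// The old assertion split on a literal space; splitting on /\s+/ treats the
// newline separator the same way, so the expected word list is unchanged.
spaceJoined.split(' ');      // ['Test', 'AppMap', 'sql']
newlineJoined.split(/\s+/);  // ['Test', 'AppMap', 'sql']

// tokenize.ts extracts words by matching rather than splitting, but the result
// is the same for whitespace-separated content.
newlineJoined.match(/\b\w+\b/g); // ['Test', 'AppMap', 'sql']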

'Test',
'AppMap',
'test',
5 changes: 3 additions & 2 deletions packages/search/src/build-file-index.ts
@@ -15,7 +15,7 @@ const debug = makeDebug('appmap:search:build-index');
export type Tokenizer = (
content: string,
fileExtension: string
) => { symbols: string[]; words: string[] };
) => Promise<{ symbols: string[]; words: string[] }>;

type Context = {
fileIndex: FileIndex;
@@ -37,7 +37,8 @@ async function indexFile(context: Context, filePath: string) {
fileContents.length,
fileContents.slice(0, 40)
);
const tokens = context.tokenizer(fileContents, filePath);
const fileExtension = filePath.split('.').pop() ?? '';
const tokens = await context.tokenizer(fileContents, fileExtension);
const symbols = tokens.symbols.join(' ');
const words = tokens.words.join(' ');
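
A minimal sketch of a tokenizer conforming to the updated Promise-returning signature (import paths are assumed; the call is illustrative):

import { fileTokens } from './tokenize';
import { Tokenizer } from './build-file-index';

// fileTokens is itself async now, so it satisfies the Promise-returning Tokenizer type.
const tokenizer: Tokenizer = (content, fileExtension) => fileTokens(content, fileExtension);

// Callers must await the result instead of using it synchronously.
const { symbols, words } = await tokenizer('class Example {}', 'ts');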

10 changes: 6 additions & 4 deletions packages/search/src/build-snippet-index.ts
@@ -25,17 +25,19 @@ async function indexFile(context: Context, file: File) {
const extension = file.filePath.split('.').pop() || '';
const chunks = await context.splitter(fileContent, extension);

chunks.forEach((chunk) => {
for (const chunk of chunks) {
const { content, startLine } = chunk;
const snippetId = fileChunkSnippetId(filePath, startLine);
const fileExtension = file.filePath.split('.').pop() ?? '';
const { symbols, words } = await context.tokenizer(content, fileExtension);
context.snippetIndex.indexSnippet(
snippetId,
file.directory,
context.tokenizer(content, file.filePath).symbols.join(' '),
context.tokenizer(content, file.filePath).words.join(' '),
symbols.join(' '),
words.join(' '),
content
);
});
}
}

export default async function buildSnippetIndex(
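
The switch from forEach to for...of matters because the tokenizer is now awaited; a sketch of the difference (tokenize is a hypothetical stand-in for context.tokenizer):

declare function tokenize(chunk: string): Promise<void>;

async function withForEach(chunks: string[]) {
  // forEach ignores the promises returned by an async callback, so this
  // function resolves before any chunk has finished tokenizing.
  chunks.forEach(async (chunk) => {
    await tokenize(chunk);
  });
}

async function withForOf(chunks: string[]) {
  // for...of lets each await complete before moving on to the next chunk.
  for (const chunk of chunks) {
    await tokenize(chunk);
  }
}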
2 changes: 1 addition & 1 deletion packages/search/src/splitter.ts
@@ -47,6 +47,7 @@ export async function langchainSplitter(content: string, fileExtension: string):
splitter = new RecursiveCharacterTextSplitter();
}
const documents = await splitter.createDocuments([content]);
const contentLines = content.split('\n');
Collaborator:

Ouch! :/ Good catch, thanks!


// metadata includes:
// { loc: { lines: { from: 1, to: 14 } } }
@@ -58,7 +59,6 @@ export async function langchainSplitter(content: string, fileExtension: string):
content: '',
};
if (lines) {
const contentLines = content.split('\n');
result.content = contentLines.slice(lines.from - 1, lines.to).join('\n');
result.startLine = lines.from;
result.endLine = lines.to;
44 changes: 38 additions & 6 deletions packages/search/src/tokenize.ts
@@ -59,23 +59,55 @@ export function words(content: string): string[] {
return content.match(/\b\w+\b/g) ?? [];
}

/**
* Prepares a string for tokenization by splitting it into batches of lines, each of which is
* no longer than the specified maximum length.
*
* @param content The content to split into batches
* @param batchSize The maximum number of characters per batch
* @param maxLineLength The maximum length of a line
* @returns an array of batches of content
*/
export function batch(content: string, batchSize = 1000, maxLineLength = 1000): string[] {
const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
const result = [];
for (let i = 0; i < lines.length; i += batchSize) {
result.push(lines.slice(i, i + batchSize).join('\n'));
}

return result;
}
Collaborator:

Have you considered implementing this as an async generator instead? IMO it'll make the flow clearer and you won't have to muck with the event loop directly.

Contributor Author:

I'd considered it, but I believe I'd still need to be awaiting setImmediate to break up the synchronous execution, no? I don't find the async generator API to be all that great to be honest (though, the iterator helpers in Node 22 do seem quite nice).

Collaborator:

If you're generating batches asynchronously then it goes through the event loop, so you don't need setImmediate. I.e.:

async function* batch(): AsyncGenerator<string> {
  for (...) yield chunk;
}

for await (const chunk of batch())
  doSomething(chunk);
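
For completeness, a fuller sketch of that suggestion (not what the PR ended up doing; batchGenerator is a made-up name), using the promise-returning setImmediate from node:timers/promises so the generator itself yields to the event loop between batches:

import { setImmediate } from 'node:timers/promises';

async function* batchGenerator(
  content: string,
  batchSize = 1000,
  maxLineLength = 1000
): AsyncGenerator<string> {
  const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
  for (let i = 0; i < lines.length; i += batchSize) {
    if (i > 0) await setImmediate(); // yield to the event loop between batches
    yield lines.slice(i, i + batchSize).join('\n');
  }
}

// Consumption would then replace the index-based loop in fileTokens():
// for await (const chunk of batchGenerator(content)) { ...tokenize chunk... }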


type FileTokens = {
symbols: string[];
words: string[];
};

export function fileTokens(
export async function fileTokens(
content: string,
fileExtension: string,
enableGenericSymbolParsing = true
): FileTokens {
): Promise<FileTokens> {
if (enableGenericSymbolParsing)
debug('Using generic symbol parsing for file extension: %s', fileExtension);

const symbolList = queryKeywords(
symbols(content, fileExtension, enableGenericSymbolParsing)
).sort();
const wordList = queryKeywords(words(content)).sort();
const batches = batch(content);
const symbolList: string[] = [];
const wordList: string[] = [];

for (let i = 0; i < batches.length; ++i) {
if (i && i % 5 === 0) {
// Every 5th batch, wait for the next tick to avoid blocking the event loop
await new Promise((resolve) => setImmediate(resolve));
}

const batch = batches[i];
symbolList.push(...queryKeywords(symbols(batch, fileExtension, enableGenericSymbolParsing)));
wordList.push(...queryKeywords(words(batch)));
}

symbolList.sort();
wordList.sort();

// Iterate through words, with a corresponding pointer to symbols.
// If the word at the word index does not match the symbol at the symbol index,
43 changes: 40 additions & 3 deletions packages/search/test/tokenize.spec.ts
@@ -1,4 +1,4 @@
import { symbols, words, fileTokens } from '../src/tokenize';
import { symbols, words, fileTokens, batch } from '../src/tokenize';

describe('FileTokens', () => {
const content = `
@@ -29,9 +29,46 @@ describe('FileTokens', () => {
]);
});

it('should extract file tokens (symbols and words) from the content', () => {
const result = fileTokens(content, fileExtension);
it('should extract file tokens (symbols and words) from the content', async () => {
const result = await fileTokens(content, fileExtension);
expect(result.symbols).toEqual(['example', 'method1', 'method2']);
expect(result.words).toEqual(['class', 'public', 'public', 'void', 'void']);
});

it('yields to the event loop periodically', async () => {
const result = fileTokens('aaa\n'.repeat(10000), 'example');
const winner = await Promise.race([
result,
new Promise<'pass'>((resolve) => setImmediate(() => resolve('pass'))),
]);
expect(winner).toStrictEqual('pass');
await expect(result).resolves.toBeDefined();
});

describe('batch', () => {
it('should split the content into batches of lines', () => {
const a = 'aaa\n'.repeat(100);
const b = 'bbb\n'.repeat(100);
const c = 'ccc\n'.repeat(50);
const content = a + b + c;
const result = batch(content, 100);
expect(result).toStrictEqual([a.slice(0, -1), b.slice(0, -1), c]);
});

it('returns text in batches which can be joined by new line to reconstruct the original text', () => {
const a = 'aaa\n'.repeat(100);
const b = 'bbb\n'.repeat(100);
const c = 'ccc\n'.repeat(50);
const content = a + b + c;
const result = batch(content, 100);
const joined = result.join('\n');
expect(joined).toEqual(content);
});

it('filters out lines longer than the max line length', () => {
const maxLen = 10;
const result = batch(`${'a'.repeat(maxLen + 1)}\nb\n${'c'.repeat(maxLen)}`, 100, maxLen);
expect(result).toStrictEqual([`b\n${'c'.repeat(maxLen)}`]);
});
});
});