From 832400f982aed82f74fe756ca76bbcb7c3893b5d Mon Sep 17 00:00:00 2001
From: Dustin Byrne
Date: Tue, 21 Jan 2025 17:12:41 -0500
Subject: [PATCH 1/4] fix: Performance issue when chunking large documents

The content would be split for each chunk, causing the process to hang
if the file was large enough.
---
 packages/search/src/splitter.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/search/src/splitter.ts b/packages/search/src/splitter.ts
index a71955128d..ef3f5aa33e 100644
--- a/packages/search/src/splitter.ts
+++ b/packages/search/src/splitter.ts
@@ -47,6 +47,7 @@ export async function langchainSplitter(content: string, fileExtension: string):
     splitter = new RecursiveCharacterTextSplitter();
   }
   const documents = await splitter.createDocuments([content]);
+  const contentLines = content.split('\n');
 
   // metadata includes:
   // { loc: { lines: { from: 1, to: 14 } } }
@@ -58,7 +59,6 @@ export async function langchainSplitter(content: string, fileExtension: string):
       content: '',
     };
     if (lines) {
-      const contentLines = content.split('\n');
       result.content = contentLines.slice(lines.from - 1, lines.to).join('\n');
       result.startLine = lines.from;
       result.endLine = lines.to;

From 5db40a8377b483f0a3e26654950d91b1ebaed23d Mon Sep 17 00:00:00 2001
From: Dustin Byrne
Date: Tue, 21 Jan 2025 17:14:50 -0500
Subject: [PATCH 2/4] fix: Prevent re-tokenization of chunks

---
 packages/search/src/build-snippet-index.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/packages/search/src/build-snippet-index.ts b/packages/search/src/build-snippet-index.ts
index 7e86ef65e0..67831769e8 100644
--- a/packages/search/src/build-snippet-index.ts
+++ b/packages/search/src/build-snippet-index.ts
@@ -28,11 +28,12 @@ async function indexFile(context: Context, file: File) {
   chunks.forEach((chunk) => {
     const { content, startLine } = chunk;
     const snippetId = fileChunkSnippetId(filePath, startLine);
+    const { symbols, words } = context.tokenizer(content, file.filePath);
     context.snippetIndex.indexSnippet(
       snippetId,
       file.directory,
-      context.tokenizer(content, file.filePath).symbols.join(' '),
-      context.tokenizer(content, file.filePath).words.join(' '),
+      symbols.join(' '),
+      words.join(' '),
       content
     );
   });

From 2aa177aa46a8d10ce5e4fd72e2c0002ad11ed448 Mon Sep 17 00:00:00 2001
From: Dustin Byrne
Date: Wed, 22 Jan 2025 13:08:12 -0500
Subject: [PATCH 3/4] fix: Tokenization will consider the file type

---
 packages/search/src/build-file-index.ts    | 3 ++-
 packages/search/src/build-snippet-index.ts | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/search/src/build-file-index.ts b/packages/search/src/build-file-index.ts
index 377f6f4fad..72f7ddb0b1 100644
--- a/packages/search/src/build-file-index.ts
+++ b/packages/search/src/build-file-index.ts
@@ -37,7 +37,8 @@ async function indexFile(context: Context, filePath: string) {
     fileContents.length,
     fileContents.slice(0, 40)
   );
-  const tokens = context.tokenizer(fileContents, filePath);
+  const fileExtension = filePath.split('.').pop() ?? '';
+  const tokens = context.tokenizer(fileContents, fileExtension);
   const symbols = tokens.symbols.join(' ');
   const words = tokens.words.join(' ');
 
diff --git a/packages/search/src/build-snippet-index.ts b/packages/search/src/build-snippet-index.ts
index 67831769e8..67fd3f75cb 100644
--- a/packages/search/src/build-snippet-index.ts
+++ b/packages/search/src/build-snippet-index.ts
@@ -28,7 +28,8 @@ async function indexFile(context: Context, file: File) {
   chunks.forEach((chunk) => {
     const { content, startLine } = chunk;
     const snippetId = fileChunkSnippetId(filePath, startLine);
-    const { symbols, words } = context.tokenizer(content, file.filePath);
+    const fileExtension = file.filePath.split('.').pop() ?? '';
+    const { symbols, words } = context.tokenizer(content, fileExtension);
     context.snippetIndex.indexSnippet(
       snippetId,
       file.directory,

From 6a6c2184ae23d6f72fbfcbb1d957ab67bed6a1b9 Mon Sep 17 00:00:00 2001
From: Dustin Byrne
Date: Tue, 21 Jan 2025 17:19:32 -0500
Subject: [PATCH 4/4] fix: Tokenization no longer hangs the process

Tokenization is now performed in batches and will occasionally yield back to
the main event loop instead of blocking the main thread indefinitely.
---
 .../cli/src/rpc/explain/index/appmap-index.ts |  4 +-
 .../rpc/explain/index/appmap-index.spec.ts    |  2 +-
 packages/search/src/build-file-index.ts       |  4 +-
 packages/search/src/build-snippet-index.ts    |  6 +--
 packages/search/src/tokenize.ts               | 44 ++++++++++++++++---
 packages/search/test/tokenize.spec.ts         | 43 ++++++++++++++++--
 6 files changed, 87 insertions(+), 16 deletions(-)

diff --git a/packages/cli/src/rpc/explain/index/appmap-index.ts b/packages/cli/src/rpc/explain/index/appmap-index.ts
index 17ff308fc7..8a1eee4e99 100644
--- a/packages/cli/src/rpc/explain/index/appmap-index.ts
+++ b/packages/cli/src/rpc/explain/index/appmap-index.ts
@@ -126,7 +126,9 @@ export async function readAppMapContent(appmapFile: string): Promise<string> {
   appmapWords.push(...parameters);
   appmapWords.push(...types);
 
-  return appmapWords.join(' ');
+  // Words are separated by new lines to reduce the time spent tokenizing the content.
+  // Long lines may be slower to tokenize or be skipped altogether.
+  return appmapWords.join('\n');
 }
 
 export function trueFilter(): Promise {
diff --git a/packages/cli/tests/unit/rpc/explain/index/appmap-index.spec.ts b/packages/cli/tests/unit/rpc/explain/index/appmap-index.spec.ts
index b51c47be4b..9746c8a558 100644
--- a/packages/cli/tests/unit/rpc/explain/index/appmap-index.spec.ts
+++ b/packages/cli/tests/unit/rpc/explain/index/appmap-index.spec.ts
@@ -63,7 +63,7 @@ describe('readAppMapContent', () => {
     expect(content).toContain('sql');
     expect(content).toContain('database');
 
-    expect(content.split(' ')).toEqual([
+    expect(content.split(/\s+/)).toEqual([
       'Test',
       'AppMap',
       'test',
diff --git a/packages/search/src/build-file-index.ts b/packages/search/src/build-file-index.ts
index 72f7ddb0b1..bfb59113ff 100644
--- a/packages/search/src/build-file-index.ts
+++ b/packages/search/src/build-file-index.ts
@@ -15,7 +15,7 @@ const debug = makeDebug('appmap:search:build-index');
 export type Tokenizer = (
   content: string,
   fileExtension: string
-) => { symbols: string[]; words: string[] };
+) => Promise<{ symbols: string[]; words: string[] }>;
 
 type Context = {
   fileIndex: FileIndex;
@@ -38,7 +38,7 @@ async function indexFile(context: Context, filePath: string) {
     fileContents.slice(0, 40)
   );
   const fileExtension = filePath.split('.').pop() ?? '';
-  const tokens = context.tokenizer(fileContents, fileExtension);
+  const tokens = await context.tokenizer(fileContents, fileExtension);
   const symbols = tokens.symbols.join(' ');
   const words = tokens.words.join(' ');
 
diff --git a/packages/search/src/build-snippet-index.ts b/packages/search/src/build-snippet-index.ts
index 67fd3f75cb..d9d5dae43f 100644
--- a/packages/search/src/build-snippet-index.ts
+++ b/packages/search/src/build-snippet-index.ts
@@ -25,11 +25,11 @@ async function indexFile(context: Context, file: File) {
   const extension = file.filePath.split('.').pop() || '';
   const chunks = await context.splitter(fileContent, extension);
 
-  chunks.forEach((chunk) => {
+  for (const chunk of chunks) {
     const { content, startLine } = chunk;
     const snippetId = fileChunkSnippetId(filePath, startLine);
     const fileExtension = file.filePath.split('.').pop() ?? '';
-    const { symbols, words } = context.tokenizer(content, fileExtension);
+    const { symbols, words } = await context.tokenizer(content, fileExtension);
     context.snippetIndex.indexSnippet(
       snippetId,
       file.directory,
@@ -37,7 +37,7 @@ async function indexFile(context: Context, file: File) {
       words.join(' '),
       content
     );
-  });
+  }
 }
 
 export default async function buildSnippetIndex(
diff --git a/packages/search/src/tokenize.ts b/packages/search/src/tokenize.ts
index caf9a49e14..0a9e2b78bf 100644
--- a/packages/search/src/tokenize.ts
+++ b/packages/search/src/tokenize.ts
@@ -59,23 +59,55 @@ export function words(content: string): string[] {
   return content.match(/\b\w+\b/g) ?? [];
 }
 
+/**
+ * Prepares a string for tokenization by splitting it into batches of lines. Lines longer
+ * than the specified maximum length are dropped.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of lines per batch
+ * @param maxLineLength The maximum length of a line
+ * @returns an array of batches of content
+ */
+export function batch(content: string, batchSize = 1000, maxLineLength = 1000): string[] {
+  const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
+  const result = [];
+  for (let i = 0; i < lines.length; i += batchSize) {
+    result.push(lines.slice(i, i + batchSize).join('\n'));
+  }
+
+  return result;
+}
+
 type FileTokens = {
   symbols: string[];
   words: string[];
 };
 
-export function fileTokens(
+export async function fileTokens(
   content: string,
   fileExtension: string,
   enableGenericSymbolParsing = true
-): FileTokens {
+): Promise<FileTokens> {
   if (enableGenericSymbolParsing)
     debug('Using generic symbol parsing for file extension: %s', fileExtension);
 
-  const symbolList = queryKeywords(
-    symbols(content, fileExtension, enableGenericSymbolParsing)
-  ).sort();
-  const wordList = queryKeywords(words(content)).sort();
+  const batches = batch(content);
+  const symbolList: string[] = [];
+  const wordList: string[] = [];
+
+  for (let i = 0; i < batches.length; ++i) {
+    if (i && i % 5 === 0) {
+      // Every 5th batch, wait for the next tick to avoid blocking the event loop
+      await new Promise((resolve) => setImmediate(resolve));
+    }
+
+    const batch = batches[i];
+    symbolList.push(...queryKeywords(symbols(batch, fileExtension, enableGenericSymbolParsing)));
+    wordList.push(...queryKeywords(words(batch)));
+  }
+
+  symbolList.sort();
+  wordList.sort();
 
   // Iterate through words, with a corresponding pointer to symbols.
// If the word at the word index does not match the symbol at the symbol index, diff --git a/packages/search/test/tokenize.spec.ts b/packages/search/test/tokenize.spec.ts index 4e32df4438..e61cae8c03 100644 --- a/packages/search/test/tokenize.spec.ts +++ b/packages/search/test/tokenize.spec.ts @@ -1,4 +1,4 @@ -import { symbols, words, fileTokens } from '../src/tokenize'; +import { symbols, words, fileTokens, batch } from '../src/tokenize'; describe('FileTokens', () => { const content = ` @@ -29,9 +29,46 @@ describe('FileTokens', () => { ]); }); - it('should extract file tokens (symbols and words) from the content', () => { - const result = fileTokens(content, fileExtension); + it('should extract file tokens (symbols and words) from the content', async () => { + const result = await fileTokens(content, fileExtension); expect(result.symbols).toEqual(['example', 'method1', 'method2']); expect(result.words).toEqual(['class', 'public', 'public', 'void', 'void']); }); + + it('yields to the event loop periodically', async () => { + const result = fileTokens('aaa\n'.repeat(10000), 'example'); + const winner = await Promise.race([ + result, + new Promise<'pass'>((resolve) => setImmediate(() => resolve('pass'))), + ]); + expect(winner).toStrictEqual('pass'); + await expect(result).resolves.toBeDefined(); + }); + + describe('batch', () => { + it('should split the content into batches of lines', () => { + const a = 'aaa\n'.repeat(100); + const b = 'bbb\n'.repeat(100); + const c = 'ccc\n'.repeat(50); + const content = a + b + c; + const result = batch(content, 100); + expect(result).toStrictEqual([a.slice(0, -1), b.slice(0, -1), c]); + }); + + it('returns text in batches which can be joined by new line to reconstruct the original text', () => { + const a = 'aaa\n'.repeat(100); + const b = 'bbb\n'.repeat(100); + const c = 'ccc\n'.repeat(50); + const content = a + b + c; + const result = batch(content, 100); + const joined = result.join('\n'); + expect(joined).toEqual(content); + }); + + it('filters out lines longer than the max line length', () => { + const maxLen = 10; + const result = batch(`${'a'.repeat(maxLen + 1)}\nb\n${'c'.repeat(maxLen)}`, 100, maxLen); + expect(result).toStrictEqual([`b\n${'c'.repeat(maxLen)}`]); + }); + }); });
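
A minimal sketch of the yield-to-the-event-loop pattern that PATCH 4/4 applies inside
fileTokens, assuming a Node.js runtime. The processInBatches helper and its batch size
below are illustrative assumptions for discussion, not part of these patches.

// Illustrative sketch: run CPU-bound work in batches and periodically defer to
// the event loop so timers and I/O callbacks are not starved indefinitely.
async function processInBatches<T, R>(
  items: T[],
  work: (item: T) => R,
  batchSize = 1000
): Promise<R[]> {
  const results: R[] = [];
  for (let i = 0; i < items.length; i += batchSize) {
    if (i > 0 && (i / batchSize) % 5 === 0) {
      // Every 5th batch, wait for the next turn of the event loop.
      await new Promise<void>((resolve) => setImmediate(resolve));
    }
    for (const item of items.slice(i, i + batchSize)) {
      results.push(work(item));
    }
  }
  return results;
}

// Because the tokenizer is now async, callers must await it, for example:
// const { symbols, words } = await context.tokenizer(content, fileExtension);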