fix: Tokenization no longer hangs the process
Tokenization is now performed in batches and will occasionally yield
back to the main event loop instead of blocking the main thread
indefinitely.
dustinbyrne committed Jan 21, 2025
1 parent eb0a840 commit c548d34
Showing 4 changed files with 86 additions and 14 deletions.
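
The change applies a standard pattern for keeping a Node.js process responsive during CPU-bound work: slice the work into batches and await setImmediate between batches, so pending I/O, timers, and other callbacks get a turn on the event loop. A minimal sketch of the pattern, independent of this commit (the helper name, signature, and yield cadence are illustrative assumptions, not code from the repository):

// Minimal sketch of the batching/yielding pattern; the helper name and
// yield cadence are assumptions for illustration, not part of this commit.
async function processInBatches<T, R>(
  items: T[],
  work: (item: T) => R,
  itemsPerYield = 5
): Promise<R[]> {
  const results: R[] = [];
  for (let i = 0; i < items.length; i++) {
    // Hand control back to the event loop every few items.
    if (i && i % itemsPerYield === 0) {
      await new Promise((resolve) => setImmediate(resolve));
    }
    results.push(work(items[i]));
  }
  return results;
}

The diffs below apply this shape inside fileTokens and propagate the resulting asynchrony out through the Tokenizer type and its callers.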
4 changes: 2 additions & 2 deletions packages/search/src/build-file-index.ts
@@ -15,7 +15,7 @@ const debug = makeDebug('appmap:search:build-index');
 export type Tokenizer = (
   content: string,
   fileExtension: string
-) => { symbols: string[]; words: string[] };
+) => Promise<{ symbols: string[]; words: string[] }>;
 
 type Context = {
   fileIndex: FileIndex;
@@ -38,7 +38,7 @@ async function indexFile(context: Context, filePath: string) {
     fileContents.slice(0, 40)
   );
   const fileExtension = filePath.split('.').pop() ?? '';
-  const tokens = context.tokenizer(fileContents, fileExtension);
+  const tokens = await context.tokenizer(fileContents, fileExtension);
   const symbols = tokens.symbols.join(' ');
   const words = tokens.words.join(' ');
 
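Because fileTokens (further below) now awaits between batches, every Tokenizer implementation returns a Promise, and indexFile must await it. A sketch of a conforming implementation under the new signature — myTokenizer is an illustrative name, and a real implementation might simply delegate to fileTokens from tokenize.ts:

// Assuming: import { fileTokens } from './tokenize';
// `myTokenizer` is illustrative, not part of the commit.
const myTokenizer: Tokenizer = async (content, fileExtension) =>
  fileTokens(content, fileExtension);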
6 changes: 3 additions & 3 deletions packages/search/src/build-snippet-index.ts
@@ -25,19 +25,19 @@ async function indexFile(context: Context, file: File) {
   const extension = file.filePath.split('.').pop() || '';
   const chunks = await context.splitter(fileContent, extension);
 
-  chunks.forEach((chunk) => {
+  for (const chunk of chunks) {
     const { content, startLine } = chunk;
     const snippetId = fileChunkSnippetId(filePath, startLine);
     const fileExtension = file.filePath.split('.').pop() ?? '';
-    const { symbols, words } = context.tokenizer(content, fileExtension);
+    const { symbols, words } = await context.tokenizer(content, fileExtension);
     context.snippetIndex.indexSnippet(
       snippetId,
       file.directory,
       symbols.join(' '),
       words.join(' '),
       content
     );
-  });
+  }
 }
 
 export default async function buildSnippetIndex(
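The forEach-to-for...of change is load-bearing, not stylistic: Array.prototype.forEach ignores the promise returned by an async callback, so an await inside the callback would not pause indexFile, the chunks would all be tokenized concurrently, and indexFile could resolve before any snippet was indexed. for...of keeps the awaits sequential. A contrast sketch, where tokenizeChunk is an illustrative stand-in for context.tokenizer:

// Illustrative stand-in for context.tokenizer.
declare function tokenizeChunk(chunk: string): Promise<void>;

async function broken(chunks: string[]) {
  // forEach discards the returned promises: broken() resolves immediately,
  // while the tokenize calls are still running, unawaited.
  chunks.forEach(async (chunk) => {
    await tokenizeChunk(chunk);
  });
}

async function fixed(chunks: string[]) {
  // Each iteration completes before the next begins, and fixed()
  // resolves only after the last chunk is tokenized.
  for (const chunk of chunks) {
    await tokenizeChunk(chunk);
  }
}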
47 changes: 41 additions & 6 deletions packages/search/src/tokenize.ts
@@ -1,3 +1,4 @@
+import { resolve } from 'node:path';
 import queryKeywords from './query-keywords';
 import makeDebug from 'debug';
 
@@ -59,23 +60,57 @@ export function words(content: string): string[] {
   return content.match(/\b\w+\b/g) ?? [];
 }
 
+/**
+ * Prepares a string for tokenization by splitting it into batches of lines,
+ * dropping any line longer than the specified maximum length.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of lines per batch
+ * @param maxLineLength The maximum length, in characters, of a line to keep
+ * @returns an array of batches of content
+ */
+export function batch(content: string, batchSize = 1000, maxLineLength = 1000): string[] {
+  const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
+  const result = [];
+  for (let i = 0; i < lines.length; i += batchSize) {
+    result.push(lines.slice(i, i + batchSize).join('\n'));
+  }
+
+  return result;
+}
+
 type FileTokens = {
   symbols: string[];
   words: string[];
 };
 
-export function fileTokens(
+export async function fileTokens(
   content: string,
   fileExtension: string,
   enableGenericSymbolParsing = true
-): FileTokens {
+): Promise<FileTokens> {
   if (enableGenericSymbolParsing)
     debug('Using generic symbol parsing for file extension: %s', fileExtension);
 
-  const symbolList = queryKeywords(
-    symbols(content, fileExtension, enableGenericSymbolParsing)
-  ).sort();
-  const wordList = queryKeywords(words(content)).sort();
+  const batches = batch(content);
+  const symbolList: string[] = [];
+  const wordList: string[] = [];
+
+  console.log(`batch size: ${batches.length}`);
+  for (let i = 0; i < batches.length; ++i) {
+    if (i && i % 5 === 0) {
+      // Every 5th batch, wait for the next tick to avoid blocking the event loop
+      await new Promise((resolve) => setImmediate(resolve));
+      console.log(`yield ${i}`);
+    }
+
+    const batch = batches[i];
+    symbolList.push(...queryKeywords(symbols(batch, fileExtension, enableGenericSymbolParsing)));
+    wordList.push(...queryKeywords(words(batch)));
+  }
+
+  symbolList.sort();
+  wordList.sort();
 
   // Iterate through words, with a corresponding pointer to symbols.
   // If the word at the word index does not match the symbol at the symbol index,
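Two properties of the new code are worth spelling out. batch splits on newlines and rejoins each group of (by default) 1,000 lines with '\n', so joining the batches back with '\n' reconstructs the input exactly, except that lines longer than maxLineLength are silently dropped. And because fileTokens awaits setImmediate every fifth batch, the loop yields roughly once per 5,000 lines at the defaults. A usage sketch (the relative import path is assumed):

// Assuming: import { batch } from './tokenize';
const text = 'one\ntwo\nthree\nfour\nfive';

// Two lines per batch: ['one\ntwo', 'three\nfour', 'five']
console.log(batch(text, 2));

// Joining with '\n' reconstructs the input, as the new tests assert.
console.log(batch(text, 2).join('\n') === text); // true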
43 changes: 40 additions & 3 deletions packages/search/test/tokenize.spec.ts
@@ -1,4 +1,4 @@
-import { symbols, words, fileTokens } from '../src/tokenize';
+import { symbols, words, fileTokens, batch } from '../src/tokenize';
 
 describe('FileTokens', () => {
   const content = `
@@ -29,9 +29,46 @@ describe('FileTokens', () => {
     ]);
   });
 
-  it('should extract file tokens (symbols and words) from the content', () => {
-    const result = fileTokens(content, fileExtension);
+  it('should extract file tokens (symbols and words) from the content', async () => {
+    const result = await fileTokens(content, fileExtension);
     expect(result.symbols).toEqual(['example', 'method1', 'method2']);
     expect(result.words).toEqual(['class', 'public', 'public', 'void', 'void']);
   });
+
+  it('yields to the event loop periodically', async () => {
+    const result = fileTokens('aaa\n'.repeat(10000), 'example');
+    const winner = await Promise.race([
+      result,
+      new Promise<'pass'>((resolve) => setImmediate(() => resolve('pass'))),
+    ]);
+    expect(winner).toStrictEqual('pass');
+    await expect(result).resolves.toBeDefined();
+  });
+
+  describe('batch', () => {
+    it('should split the content into batches of lines', () => {
+      const a = 'aaa\n'.repeat(100);
+      const b = 'bbb\n'.repeat(100);
+      const c = 'ccc\n'.repeat(50);
+      const content = a + b + c;
+      const result = batch(content, 100);
+      expect(result).toStrictEqual([a.slice(0, -1), b.slice(0, -1), c]);
+    });
+
+    it('returns text in batches which can be joined by new line to reconstruct the original text', () => {
+      const a = 'aaa\n'.repeat(100);
+      const b = 'bbb\n'.repeat(100);
+      const c = 'ccc\n'.repeat(50);
+      const content = a + b + c;
+      const result = batch(content, 100);
+      const joined = result.join('\n');
+      expect(joined).toEqual(content);
+    });
+
+    it('filters out lines longer than the max line length', () => {
+      const maxLen = 10;
+      const result = batch(`${'a'.repeat(maxLen + 1)}\nb\n${'c'.repeat(maxLen)}`, 100, maxLen);
+      expect(result).toStrictEqual([`b\n${'c'.repeat(maxLen)}`]);
+    });
+  });
 });
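
A note on why the 'yields to the event loop periodically' test proves what it claims, reasoning from event-loop ordering: 'aaa\n'.repeat(10000) splits into 10,001 lines (the trailing newline contributes an empty final line), so batch produces 11 batches and fileTokens awaits setImmediate at i = 5 and i = 10. The first of those immediates is scheduled before the test's own setImmediate and the second after it, so 'pass' wins the race precisely because the implementation yields mid-run. Had fileTokens run to completion without awaiting a macrotask, its already-settled promise would win the race and the test would fail.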
