fix: Tokenization performance #2213

Merged
merged 4 commits into from
Jan 22, 2025
Merged
4 changes: 3 additions & 1 deletion packages/cli/src/rpc/explain/index/appmap-index.ts
@@ -126,7 +126,9 @@ export async function readAppMapContent(appmapFile: string): Promise<string> {
appmapWords.push(...parameters);
appmapWords.push(...types);

return appmapWords.join(' ');
// Words are separated by new lines to reduce the time spent tokenizing the content.
// Long lines may be slower to tokenize or be skipped altogether.
return appmapWords.join('\n');
Contributor Author, commenting on lines +129 to +131:

My assessment here is that changing this whitespace is inconsequential, except in the scenario described in the comment.
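
For illustration, a rough sketch of that scenario, assuming the default batchSize and maxLineLength of the batch() helper added to tokenize.ts later in this PR (the numbers are made up):

const tokens = Array(50_000).fill('word');

// Joined with spaces, the content is one very long line, which batch() would
// filter out entirely because it exceeds maxLineLength.
batch(tokens.join(' ')).length;  // 0

// Joined with newlines, every token is a short line; nothing is skipped and
// the content is split into batches of 1000 lines each.
batch(tokens.join('\n')).length; // 50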

}

export function trueFilter(): Promise<boolean> {
@@ -63,7 +63,7 @@ describe('readAppMapContent', () => {
expect(content).toContain('sql');
expect(content).toContain('database');

expect(content.split(' ')).toEqual([
expect(content.split(/\s+/)).toEqual([
Contributor Author:

The content is no longer joined by spaces, so splitting on whitespace restores the previous behavior. This is functionally the same as how we're splitting words in tokenize, even though the regex is different.
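
As a quick sketch of that equivalence (not part of the PR; the sample words are arbitrary):

const spaceJoined = ['Test', 'AppMap', 'sql'].join(' ');
const newlineJoined = ['Test', 'AppMap', 'sql'].join('\n');

// The old assertion split on a literal space; splitting on /\s+/ treats the
// newline separator the same way, so the expected word list is unchanged.
spaceJoined.split(' ');      // ['Test', 'AppMap', 'sql']
newlineJoined.split(/\s+/);  // ['Test', 'AppMap', 'sql']

// tokenize.ts extracts words by matching rather than splitting, but the result
// is the same for whitespace-separated content.
newlineJoined.match(/\b\w+\b/g); // ['Test', 'AppMap', 'sql']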

'Test',
'AppMap',
'test',
5 changes: 3 additions & 2 deletions packages/search/src/build-file-index.ts
@@ -15,7 +15,7 @@ const debug = makeDebug('appmap:search:build-index');
export type Tokenizer = (
content: string,
fileExtension: string
) => { symbols: string[]; words: string[] };
) => Promise<{ symbols: string[]; words: string[] }>;

type Context = {
fileIndex: FileIndex;
@@ -37,7 +37,8 @@ async function indexFile(context: Context, filePath: string) {
fileContents.length,
fileContents.slice(0, 40)
);
const tokens = context.tokenizer(fileContents, filePath);
const fileExtension = filePath.split('.').pop() ?? '';
const tokens = await context.tokenizer(fileContents, fileExtension);
const symbols = tokens.symbols.join(' ');
const words = tokens.words.join(' ');
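
A minimal sketch of a tokenizer conforming to the updated Promise-returning signature (import paths are assumed; the call is illustrative):

import { fileTokens } from './tokenize';
import { Tokenizer } from './build-file-index';

// fileTokens is itself async now, so it satisfies the Promise-returning Tokenizer type.
const tokenizer: Tokenizer = (content, fileExtension) => fileTokens(content, fileExtension);

// Callers must await the result instead of using it synchronously.
const { symbols, words } = await tokenizer('class Example {}', 'ts');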

10 changes: 6 additions & 4 deletions packages/search/src/build-snippet-index.ts
@@ -25,17 +25,19 @@ async function indexFile(context: Context, file: File) {
const extension = file.filePath.split('.').pop() || '';
const chunks = await context.splitter(fileContent, extension);

chunks.forEach((chunk) => {
for (const chunk of chunks) {
const { content, startLine } = chunk;
const snippetId = fileChunkSnippetId(filePath, startLine);
const fileExtension = file.filePath.split('.').pop() ?? '';
const { symbols, words } = await context.tokenizer(content, fileExtension);
context.snippetIndex.indexSnippet(
snippetId,
file.directory,
context.tokenizer(content, file.filePath).symbols.join(' '),
context.tokenizer(content, file.filePath).words.join(' '),
symbols.join(' '),
words.join(' '),
content
);
});
}
}

export default async function buildSnippetIndex(
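
The switch from forEach to for...of matters because the tokenizer is now awaited; a sketch of the difference (tokenize is a hypothetical stand-in for context.tokenizer):

declare function tokenize(chunk: string): Promise<void>;

async function withForEach(chunks: string[]) {
  // forEach ignores the promises returned by an async callback, so this
  // function resolves before any chunk has finished tokenizing.
  chunks.forEach(async (chunk) => {
    await tokenize(chunk);
  });
}

async function withForOf(chunks: string[]) {
  // for...of lets each await complete before moving on to the next chunk.
  for (const chunk of chunks) {
    await tokenize(chunk);
  }
}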
2 changes: 1 addition & 1 deletion packages/search/src/splitter.ts
@@ -47,6 +47,7 @@ export async function langchainSplitter(content: string, fileExtension: string):
splitter = new RecursiveCharacterTextSplitter();
}
const documents = await splitter.createDocuments([content]);
const contentLines = content.split('\n');
Collaborator:

Ouch! :/ Good catch, thanks!


// metadata includes:
// { loc: { lines: { from: 1, to: 14 } } }
@@ -58,7 +59,6 @@ export async function langchainSplitter(content: string, fileExtension: string):
content: '',
};
if (lines) {
const contentLines = content.split('\n');
result.content = contentLines.slice(lines.from - 1, lines.to).join('\n');
result.startLine = lines.from;
result.endLine = lines.to;
44 changes: 38 additions & 6 deletions packages/search/src/tokenize.ts
@@ -59,23 +59,55 @@ export function words(content: string): string[] {
return content.match(/\b\w+\b/g) ?? [];
}

/**
* Prepares a string for tokenization by splitting it into batches of lines, each of which is
* no longer than the specified maximum length.
*
* @param content The content to split into batches
* @param batchSize The maximum number of characters per batch
* @param maxLineLength The maximum length of a line
* @returns an array of batches of content
*/
export function batch(content: string, batchSize = 1000, maxLineLength = 1000): string[] {
const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
const result = [];
for (let i = 0; i < lines.length; i += batchSize) {
result.push(lines.slice(i, i + batchSize).join('\n'));
}

return result;
}
Collaborator:

Have you considered implementing this as an async generator instead? IMO it'll make the flow clearer and you won't have to muck with the event loop directly.

Contributor Author:

I'd considered it, but I believe I'd still need to be awaiting setImmediate to break up the synchronous execution, no? I don't find the async generator API to be all that great to be honest (though, the iterator helpers in Node 22 do seem quite nice).

Collaborator:

If you're generating batches asynchronously then it goes through the event loop, so you don't need setImmediate. I.e.:

async function* batch(): AsyncGenerator<string> {
  for (...) yield chunk;
}

for await (const chunk of batch())
  doSomething(chunk);
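
For completeness, a fuller sketch of that suggestion (not what the PR ended up doing; batchGenerator is a made-up name), using the promise-returning setImmediate from node:timers/promises so the generator itself yields to the event loop between batches:

import { setImmediate } from 'node:timers/promises';

async function* batchGenerator(
  content: string,
  batchSize = 1000,
  maxLineLength = 1000
): AsyncGenerator<string> {
  const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
  for (let i = 0; i < lines.length; i += batchSize) {
    if (i > 0) await setImmediate(); // yield to the event loop between batches
    yield lines.slice(i, i + batchSize).join('\n');
  }
}

// Consumption would then replace the index-based loop in fileTokens():
// for await (const chunk of batchGenerator(content)) { ...tokenize chunk... }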


type FileTokens = {
symbols: string[];
words: string[];
};

export function fileTokens(
export async function fileTokens(
content: string,
fileExtension: string,
enableGenericSymbolParsing = true
): FileTokens {
): Promise<FileTokens> {
if (enableGenericSymbolParsing)
debug('Using generic symbol parsing for file extension: %s', fileExtension);

const symbolList = queryKeywords(
symbols(content, fileExtension, enableGenericSymbolParsing)
).sort();
const wordList = queryKeywords(words(content)).sort();
const batches = batch(content);
const symbolList: string[] = [];
const wordList: string[] = [];

for (let i = 0; i < batches.length; ++i) {
if (i && i % 5 === 0) {
// Every 5th batch, wait for the next tick to avoid blocking the event loop
await new Promise((resolve) => setImmediate(resolve));
}

const batch = batches[i];
symbolList.push(...queryKeywords(symbols(batch, fileExtension, enableGenericSymbolParsing)));
wordList.push(...queryKeywords(words(batch)));
}

symbolList.sort();
wordList.sort();

// Iterate through words, with a corresponding pointer to symbols.
// If the word at the word index does not match the symbol at the symbol index,
43 changes: 40 additions & 3 deletions packages/search/test/tokenize.spec.ts
@@ -1,4 +1,4 @@
import { symbols, words, fileTokens } from '../src/tokenize';
import { symbols, words, fileTokens, batch } from '../src/tokenize';

describe('FileTokens', () => {
const content = `
@@ -29,9 +29,46 @@ describe('FileTokens', () => {
]);
});

it('should extract file tokens (symbols and words) from the content', () => {
const result = fileTokens(content, fileExtension);
it('should extract file tokens (symbols and words) from the content', async () => {
const result = await fileTokens(content, fileExtension);
expect(result.symbols).toEqual(['example', 'method1', 'method2']);
expect(result.words).toEqual(['class', 'public', 'public', 'void', 'void']);
});

it('yields to the event loop periodically', async () => {
const result = fileTokens('aaa\n'.repeat(10000), 'example');
const winner = await Promise.race([
result,
new Promise<'pass'>((resolve) => setImmediate(() => resolve('pass'))),
]);
expect(winner).toStrictEqual('pass');
await expect(result).resolves.toBeDefined();
});

describe('batch', () => {
it('should split the content into batches of lines', () => {
const a = 'aaa\n'.repeat(100);
const b = 'bbb\n'.repeat(100);
const c = 'ccc\n'.repeat(50);
const content = a + b + c;
const result = batch(content, 100);
expect(result).toStrictEqual([a.slice(0, -1), b.slice(0, -1), c]);
});

it('returns text in batches which can be joined by new line to reconstruct the original text', () => {
const a = 'aaa\n'.repeat(100);
const b = 'bbb\n'.repeat(100);
const c = 'ccc\n'.repeat(50);
const content = a + b + c;
const result = batch(content, 100);
const joined = result.join('\n');
expect(joined).toEqual(content);
});

it('filters out lines longer than the max line length', () => {
const maxLen = 10;
const result = batch(`${'a'.repeat(maxLen + 1)}\nb\n${'c'.repeat(maxLen)}`, 100, maxLen);
expect(result).toStrictEqual([`b\n${'c'.repeat(maxLen)}`]);
});
});
});