From 832400f982aed82f74fe756ca76bbcb7c3893b5d Mon Sep 17 00:00:00 2001
From: Dustin Byrne
Date: Tue, 21 Jan 2025 17:12:41 -0500
Subject: [PATCH 1/4] fix: Performance issue when chunking large documents

The content would be split for each chunk, causing the process to hang
if the file was large enough.
---
 packages/search/src/splitter.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/search/src/splitter.ts b/packages/search/src/splitter.ts
index a71955128d..ef3f5aa33e 100644
--- a/packages/search/src/splitter.ts
+++ b/packages/search/src/splitter.ts
@@ -47,6 +47,7 @@ export async function langchainSplitter(content: string, fileExtension: string):
     splitter = new RecursiveCharacterTextSplitter();
   }
   const documents = await splitter.createDocuments([content]);
+  const contentLines = content.split('\n');
 
   // metadata includes:
   // { loc: { lines: { from: 1, to: 14 } } }
@@ -58,7 +59,6 @@ export async function langchainSplitter(content: string, fileExtension: string):
       content: '',
     };
     if (lines) {
-      const contentLines = content.split('\n');
       result.content = contentLines.slice(lines.from - 1, lines.to).join('\n');
       result.startLine = lines.from;
       result.endLine = lines.to;

From 5db40a8377b483f0a3e26654950d91b1ebaed23d Mon Sep 17 00:00:00 2001
From: Dustin Byrne
Date: Tue, 21 Jan 2025 17:14:50 -0500
Subject: [PATCH 2/4] fix: Prevent re-tokenization of chunks

---
 packages/search/src/build-snippet-index.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/packages/search/src/build-snippet-index.ts b/packages/search/src/build-snippet-index.ts
index 7e86ef65e0..67831769e8 100644
--- a/packages/search/src/build-snippet-index.ts
+++ b/packages/search/src/build-snippet-index.ts
@@ -28,11 +28,12 @@ async function indexFile(context: Context, file: File) {
   chunks.forEach((chunk) => {
     const { content, startLine } = chunk;
     const snippetId = fileChunkSnippetId(filePath, startLine);
+    const { symbols, words } = context.tokenizer(content, file.filePath);
     context.snippetIndex.indexSnippet(
       snippetId,
       file.directory,
-      context.tokenizer(content, file.filePath).symbols.join(' '),
-      context.tokenizer(content, file.filePath).words.join(' '),
+      symbols.join(' '),
+      words.join(' '),
       content
     );
   });

From 2aa177aa46a8d10ce5e4fd72e2c0002ad11ed448 Mon Sep 17 00:00:00 2001
From: Dustin Byrne
Date: Wed, 22 Jan 2025 13:08:12 -0500
Subject: [PATCH 3/4] fix: Tokenization will consider the file type

---
 packages/search/src/build-file-index.ts    | 3 ++-
 packages/search/src/build-snippet-index.ts | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/search/src/build-file-index.ts b/packages/search/src/build-file-index.ts
index 377f6f4fad..72f7ddb0b1 100644
--- a/packages/search/src/build-file-index.ts
+++ b/packages/search/src/build-file-index.ts
@@ -37,7 +37,8 @@ async function indexFile(context: Context, filePath: string) {
     fileContents.length,
     fileContents.slice(0, 40)
   );
-  const tokens = context.tokenizer(fileContents, filePath);
+  const fileExtension = filePath.split('.').pop() ?? '';
+  const tokens = context.tokenizer(fileContents, fileExtension);
   const symbols = tokens.symbols.join(' ');
   const words = tokens.words.join(' ');
 
diff --git a/packages/search/src/build-snippet-index.ts b/packages/search/src/build-snippet-index.ts
index 67831769e8..67fd3f75cb 100644
--- a/packages/search/src/build-snippet-index.ts
+++ b/packages/search/src/build-snippet-index.ts
@@ -28,7 +28,8 @@ async function indexFile(context: Context, file: File) {
   chunks.forEach((chunk) => {
     const { content, startLine } = chunk;
     const snippetId = fileChunkSnippetId(filePath, startLine);
-    const { symbols, words } = context.tokenizer(content, file.filePath);
+    const fileExtension = file.filePath.split('.').pop() ?? '';
+    const { symbols, words } = context.tokenizer(content, fileExtension);
     context.snippetIndex.indexSnippet(
       snippetId,
       file.directory,

From 6a6c2184ae23d6f72fbfcbb1d957ab67bed6a1b9 Mon Sep 17 00:00:00 2001
From: Dustin Byrne
Date: Tue, 21 Jan 2025 17:19:32 -0500
Subject: [PATCH 4/4] fix: Tokenization no longer hangs the process

Tokenization is now performed in batches and will occasionally yield back to
the main event loop instead of blocking the main thread indefinitely.
---
 .../cli/src/rpc/explain/index/appmap-index.ts |  4 +-
 .../rpc/explain/index/appmap-index.spec.ts    |  2 +-
 packages/search/src/build-file-index.ts       |  4 +-
 packages/search/src/build-snippet-index.ts    |  6 +--
 packages/search/src/tokenize.ts               | 44 ++++++++++++++++---
 packages/search/test/tokenize.spec.ts         | 43 ++++++++++++++++--
 6 files changed, 87 insertions(+), 16 deletions(-)

diff --git a/packages/cli/src/rpc/explain/index/appmap-index.ts b/packages/cli/src/rpc/explain/index/appmap-index.ts
index 17ff308fc7..8a1eee4e99 100644
--- a/packages/cli/src/rpc/explain/index/appmap-index.ts
+++ b/packages/cli/src/rpc/explain/index/appmap-index.ts
@@ -126,7 +126,9 @@ export async function readAppMapContent(appmapFile: string): Promise<string> {
   appmapWords.push(...parameters);
   appmapWords.push(...types);
 
-  return appmapWords.join(' ');
+  // Words are separated by new lines to reduce the time spent tokenizing the content.
+  // Long lines may be slower to tokenize or be skipped altogether.
+  return appmapWords.join('\n');
 }
 
 export function trueFilter(): Promise {
diff --git a/packages/cli/tests/unit/rpc/explain/index/appmap-index.spec.ts b/packages/cli/tests/unit/rpc/explain/index/appmap-index.spec.ts
index b51c47be4b..9746c8a558 100644
--- a/packages/cli/tests/unit/rpc/explain/index/appmap-index.spec.ts
+++ b/packages/cli/tests/unit/rpc/explain/index/appmap-index.spec.ts
@@ -63,7 +63,7 @@ describe('readAppMapContent', () => {
     expect(content).toContain('sql');
     expect(content).toContain('database');
 
-    expect(content.split(' ')).toEqual([
+    expect(content.split(/\s+/)).toEqual([
       'Test',
       'AppMap',
       'test',
diff --git a/packages/search/src/build-file-index.ts b/packages/search/src/build-file-index.ts
index 72f7ddb0b1..bfb59113ff 100644
--- a/packages/search/src/build-file-index.ts
+++ b/packages/search/src/build-file-index.ts
@@ -15,7 +15,7 @@ const debug = makeDebug('appmap:search:build-index');
 export type Tokenizer = (
   content: string,
   fileExtension: string
-) => { symbols: string[]; words: string[] };
+) => Promise<{ symbols: string[]; words: string[] }>;
 
 type Context = {
   fileIndex: FileIndex;
@@ -38,7 +38,7 @@ async function indexFile(context: Context, filePath: string) {
     fileContents.slice(0, 40)
   );
   const fileExtension = filePath.split('.').pop() ?? '';
-  const tokens = context.tokenizer(fileContents, fileExtension);
+  const tokens = await context.tokenizer(fileContents, fileExtension);
   const symbols = tokens.symbols.join(' ');
   const words = tokens.words.join(' ');
 
diff --git a/packages/search/src/build-snippet-index.ts b/packages/search/src/build-snippet-index.ts
index 67fd3f75cb..d9d5dae43f 100644
--- a/packages/search/src/build-snippet-index.ts
+++ b/packages/search/src/build-snippet-index.ts
@@ -25,11 +25,11 @@ async function indexFile(context: Context, file: File) {
   const extension = file.filePath.split('.').pop() || '';
   const chunks = await context.splitter(fileContent, extension);
 
-  chunks.forEach((chunk) => {
+  for (const chunk of chunks) {
     const { content, startLine } = chunk;
     const snippetId = fileChunkSnippetId(filePath, startLine);
     const fileExtension = file.filePath.split('.').pop() ?? '';
-    const { symbols, words } = context.tokenizer(content, fileExtension);
+    const { symbols, words } = await context.tokenizer(content, fileExtension);
     context.snippetIndex.indexSnippet(
       snippetId,
       file.directory,
@@ -37,7 +37,7 @@ async function indexFile(context: Context, file: File) {
       words.join(' '),
       content
     );
-  });
+  }
 }
 
 export default async function buildSnippetIndex(
diff --git a/packages/search/src/tokenize.ts b/packages/search/src/tokenize.ts
index caf9a49e14..0a9e2b78bf 100644
--- a/packages/search/src/tokenize.ts
+++ b/packages/search/src/tokenize.ts
@@ -59,23 +59,55 @@ export function words(content: string): string[] {
   return content.match(/\b\w+\b/g) ?? [];
 }
 
+/**
+ * Prepares a string for tokenization by splitting it into batches of lines. Lines longer
+ * than the specified maximum length are dropped.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of lines per batch
+ * @param maxLineLength The maximum length of a line
+ * @returns an array of batches of content
+ */
+export function batch(content: string, batchSize = 1000, maxLineLength = 1000): string[] {
+  const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
+  const result = [];
+  for (let i = 0; i < lines.length; i += batchSize) {
+    result.push(lines.slice(i, i + batchSize).join('\n'));
+  }
+
+  return result;
+}
+
 type FileTokens = {
   symbols: string[];
   words: string[];
 };
 
-export function fileTokens(
+export async function fileTokens(
   content: string,
   fileExtension: string,
   enableGenericSymbolParsing = true
-): FileTokens {
+): Promise<FileTokens> {
   if (enableGenericSymbolParsing)
     debug('Using generic symbol parsing for file extension: %s', fileExtension);
 
-  const symbolList = queryKeywords(
-    symbols(content, fileExtension, enableGenericSymbolParsing)
-  ).sort();
-  const wordList = queryKeywords(words(content)).sort();
+  const batches = batch(content);
+  const symbolList: string[] = [];
+  const wordList: string[] = [];
+
+  for (let i = 0; i < batches.length; ++i) {
+    if (i && i % 5 === 0) {
+      // Every 5th batch, wait for the next tick to avoid blocking the event loop
+      await new Promise((resolve) => setImmediate(resolve));
+    }
+
+    const batch = batches[i];
+    symbolList.push(...queryKeywords(symbols(batch, fileExtension, enableGenericSymbolParsing)));
+    wordList.push(...queryKeywords(words(batch)));
+  }
+
+  symbolList.sort();
+  wordList.sort();
 
   // Iterate through words, with a corresponding pointer to symbols.
// If the word at the word index does not match the symbol at the symbol index, diff --git a/packages/search/test/tokenize.spec.ts b/packages/search/test/tokenize.spec.ts index 4e32df4438..e61cae8c03 100644 --- a/packages/search/test/tokenize.spec.ts +++ b/packages/search/test/tokenize.spec.ts @@ -1,4 +1,4 @@ -import { symbols, words, fileTokens } from '../src/tokenize'; +import { symbols, words, fileTokens, batch } from '../src/tokenize'; describe('FileTokens', () => { const content = ` @@ -29,9 +29,46 @@ describe('FileTokens', () => { ]); }); - it('should extract file tokens (symbols and words) from the content', () => { - const result = fileTokens(content, fileExtension); + it('should extract file tokens (symbols and words) from the content', async () => { + const result = await fileTokens(content, fileExtension); expect(result.symbols).toEqual(['example', 'method1', 'method2']); expect(result.words).toEqual(['class', 'public', 'public', 'void', 'void']); }); + + it('yields to the event loop periodically', async () => { + const result = fileTokens('aaa\n'.repeat(10000), 'example'); + const winner = await Promise.race([ + result, + new Promise<'pass'>((resolve) => setImmediate(() => resolve('pass'))), + ]); + expect(winner).toStrictEqual('pass'); + await expect(result).resolves.toBeDefined(); + }); + + describe('batch', () => { + it('should split the content into batches of lines', () => { + const a = 'aaa\n'.repeat(100); + const b = 'bbb\n'.repeat(100); + const c = 'ccc\n'.repeat(50); + const content = a + b + c; + const result = batch(content, 100); + expect(result).toStrictEqual([a.slice(0, -1), b.slice(0, -1), c]); + }); + + it('returns text in batches which can be joined by new line to reconstruct the original text', () => { + const a = 'aaa\n'.repeat(100); + const b = 'bbb\n'.repeat(100); + const c = 'ccc\n'.repeat(50); + const content = a + b + c; + const result = batch(content, 100); + const joined = result.join('\n'); + expect(joined).toEqual(content); + }); + + it('filters out lines longer than the max line length', () => { + const maxLen = 10; + const result = batch(`${'a'.repeat(maxLen + 1)}\nb\n${'c'.repeat(maxLen)}`, 100, maxLen); + expect(result).toStrictEqual([`b\n${'c'.repeat(maxLen)}`]); + }); + }); });
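
A minimal sketch of the yield-to-the-event-loop pattern that PATCH 4/4 applies inside
fileTokens, assuming a Node.js runtime. The processInBatches helper and its batch size
below are illustrative assumptions for discussion, not part of these patches.

// Illustrative sketch: run CPU-bound work in batches and periodically defer to
// the event loop so timers and I/O callbacks are not starved indefinitely.
async function processInBatches<T, R>(
  items: T[],
  work: (item: T) => R,
  batchSize = 1000
): Promise<R[]> {
  const results: R[] = [];
  for (let i = 0; i < items.length; i += batchSize) {
    if (i > 0 && (i / batchSize) % 5 === 0) {
      // Every 5th batch, wait for the next turn of the event loop.
      await new Promise<void>((resolve) => setImmediate(resolve));
    }
    for (const item of items.slice(i, i + batchSize)) {
      results.push(work(item));
    }
  }
  return results;
}

// Because the tokenizer is now async, callers must await it, for example:
// const { symbols, words } = await context.tokenizer(content, fileExtension);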