fix: Tokenization no longer hangs the process
Tokenization is now performed in batches and will occasionally yield
back to the main event loop instead of blocking the main thread
indefinitely.
dustinbyrne committed Jan 21, 2025
1 parent eb0a840 commit c548d34
Showing 4 changed files with 86 additions and 14 deletions.
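
The change applies a standard pattern for keeping a Node.js process responsive during CPU-bound work: slice the work into batches and await setImmediate between batches, so pending I/O, timers, and other callbacks get a turn on the event loop. A minimal sketch of the pattern, independent of this commit (the helper name, signature, and yield cadence are illustrative assumptions, not code from the repository):

// Minimal sketch of the batching/yielding pattern; the helper name and
// yield cadence are assumptions for illustration, not part of this commit.
async function processInBatches<T, R>(
  items: T[],
  work: (item: T) => R,
  itemsPerYield = 5
): Promise<R[]> {
  const results: R[] = [];
  for (let i = 0; i < items.length; i++) {
    // Hand control back to the event loop every few items.
    if (i && i % itemsPerYield === 0) {
      await new Promise((resolve) => setImmediate(resolve));
    }
    results.push(work(items[i]));
  }
  return results;
}

The diffs below apply this shape inside fileTokens and propagate the resulting asynchrony out through the Tokenizer type and its callers.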
4 changes: 2 additions & 2 deletions packages/search/src/build-file-index.ts
@@ -15,7 +15,7 @@ const debug = makeDebug('appmap:search:build-index');
 export type Tokenizer = (
   content: string,
   fileExtension: string
-) => { symbols: string[]; words: string[] };
+) => Promise<{ symbols: string[]; words: string[] }>;
 
 type Context = {
   fileIndex: FileIndex;
@@ -38,7 +38,7 @@ async function indexFile(context: Context, filePath: string) {
     fileContents.slice(0, 40)
   );
   const fileExtension = filePath.split('.').pop() ?? '';
-  const tokens = context.tokenizer(fileContents, fileExtension);
+  const tokens = await context.tokenizer(fileContents, fileExtension);
   const symbols = tokens.symbols.join(' ');
   const words = tokens.words.join(' ');
 
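Because fileTokens (further below) now awaits between batches, every Tokenizer implementation returns a Promise, and indexFile must await it. A sketch of a conforming implementation under the new signature — myTokenizer is an illustrative name, and a real implementation might simply delegate to fileTokens from tokenize.ts:

// Assuming: import { fileTokens } from './tokenize';
// `myTokenizer` is illustrative, not part of the commit.
const myTokenizer: Tokenizer = async (content, fileExtension) =>
  fileTokens(content, fileExtension);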
6 changes: 3 additions & 3 deletions packages/search/src/build-snippet-index.ts
@@ -25,19 +25,19 @@ async function indexFile(context: Context, file: File) {
   const extension = file.filePath.split('.').pop() || '';
   const chunks = await context.splitter(fileContent, extension);
 
-  chunks.forEach((chunk) => {
+  for (const chunk of chunks) {
     const { content, startLine } = chunk;
     const snippetId = fileChunkSnippetId(filePath, startLine);
     const fileExtension = file.filePath.split('.').pop() ?? '';
-    const { symbols, words } = context.tokenizer(content, fileExtension);
+    const { symbols, words } = await context.tokenizer(content, fileExtension);
     context.snippetIndex.indexSnippet(
       snippetId,
       file.directory,
       symbols.join(' '),
       words.join(' '),
       content
     );
-  });
+  }
 }
 
 export default async function buildSnippetIndex(
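The forEach-to-for...of change is load-bearing, not stylistic: Array.prototype.forEach ignores the promise returned by an async callback, so an await inside the callback would not pause indexFile, the chunks would all be tokenized concurrently, and indexFile could resolve before any snippet was indexed. for...of keeps the awaits sequential. A contrast sketch, where tokenizeChunk is an illustrative stand-in for context.tokenizer:

// Illustrative stand-in for context.tokenizer.
declare function tokenizeChunk(chunk: string): Promise<void>;

async function broken(chunks: string[]) {
  // forEach discards the returned promises: broken() resolves immediately,
  // while the tokenize calls are still running, unawaited.
  chunks.forEach(async (chunk) => {
    await tokenizeChunk(chunk);
  });
}

async function fixed(chunks: string[]) {
  // Each iteration completes before the next begins, and fixed()
  // resolves only after the last chunk is tokenized.
  for (const chunk of chunks) {
    await tokenizeChunk(chunk);
  }
}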
47 changes: 41 additions & 6 deletions packages/search/src/tokenize.ts
@@ -1,3 +1,4 @@
+import { resolve } from 'node:path';
 import queryKeywords from './query-keywords';
 import makeDebug from 'debug';
 
@@ -59,23 +60,57 @@ export function words(content: string): string[] {
   return content.match(/\b\w+\b/g) ?? [];
 }
 
+/**
+ * Prepares a string for tokenization by splitting it into batches of lines,
+ * dropping any line longer than the specified maximum length.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of lines per batch
+ * @param maxLineLength The maximum length, in characters, of a line to keep
+ * @returns an array of batches of content
+ */
+export function batch(content: string, batchSize = 1000, maxLineLength = 1000): string[] {
+  const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
+  const result = [];
+  for (let i = 0; i < lines.length; i += batchSize) {
+    result.push(lines.slice(i, i + batchSize).join('\n'));
+  }
+
+  return result;
+}
+
 type FileTokens = {
   symbols: string[];
   words: string[];
 };
 
-export function fileTokens(
+export async function fileTokens(
   content: string,
   fileExtension: string,
   enableGenericSymbolParsing = true
-): FileTokens {
+): Promise<FileTokens> {
   if (enableGenericSymbolParsing)
     debug('Using generic symbol parsing for file extension: %s', fileExtension);
 
-  const symbolList = queryKeywords(
-    symbols(content, fileExtension, enableGenericSymbolParsing)
-  ).sort();
-  const wordList = queryKeywords(words(content)).sort();
+  const batches = batch(content);
+  const symbolList: string[] = [];
+  const wordList: string[] = [];
+
+  console.log(`batch size: ${batches.length}`);
+  for (let i = 0; i < batches.length; ++i) {
+    if (i && i % 5 === 0) {
+      // Every 5th batch, wait for the next tick to avoid blocking the event loop
+      await new Promise((resolve) => setImmediate(resolve));
+      console.log(`yield ${i}`);
+    }
+
+    const batch = batches[i];
+    symbolList.push(...queryKeywords(symbols(batch, fileExtension, enableGenericSymbolParsing)));
+    wordList.push(...queryKeywords(words(batch)));
+  }
+
+  symbolList.sort();
+  wordList.sort();
 
   // Iterate through words, with a corresponding pointer to symbols.
   // If the word at the word index does not match the symbol at the symbol index,
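Two properties of the new code are worth spelling out. batch splits on newlines and rejoins each group of (by default) 1,000 lines with '\n', so joining the batches back with '\n' reconstructs the input exactly, except that lines longer than maxLineLength are silently dropped. And because fileTokens awaits setImmediate every fifth batch, the loop yields roughly once per 5,000 lines at the defaults. A usage sketch (the relative import path is assumed):

// Assuming: import { batch } from './tokenize';
const text = 'one\ntwo\nthree\nfour\nfive';

// Two lines per batch: ['one\ntwo', 'three\nfour', 'five']
console.log(batch(text, 2));

// Joining with '\n' reconstructs the input, as the new tests assert.
console.log(batch(text, 2).join('\n') === text); // true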
43 changes: 40 additions & 3 deletions packages/search/test/tokenize.spec.ts
@@ -1,4 +1,4 @@
-import { symbols, words, fileTokens } from '../src/tokenize';
+import { symbols, words, fileTokens, batch } from '../src/tokenize';
 
 describe('FileTokens', () => {
   const content = `
@@ -29,9 +29,46 @@ describe('FileTokens', () => {
     ]);
   });
 
-  it('should extract file tokens (symbols and words) from the content', () => {
-    const result = fileTokens(content, fileExtension);
+  it('should extract file tokens (symbols and words) from the content', async () => {
+    const result = await fileTokens(content, fileExtension);
     expect(result.symbols).toEqual(['example', 'method1', 'method2']);
     expect(result.words).toEqual(['class', 'public', 'public', 'void', 'void']);
   });
+
+  it('yields to the event loop periodically', async () => {
+    const result = fileTokens('aaa\n'.repeat(10000), 'example');
+    const winner = await Promise.race([
+      result,
+      new Promise<'pass'>((resolve) => setImmediate(() => resolve('pass'))),
+    ]);
+    expect(winner).toStrictEqual('pass');
+    await expect(result).resolves.toBeDefined();
+  });
+
+  describe('batch', () => {
+    it('should split the content into batches of lines', () => {
+      const a = 'aaa\n'.repeat(100);
+      const b = 'bbb\n'.repeat(100);
+      const c = 'ccc\n'.repeat(50);
+      const content = a + b + c;
+      const result = batch(content, 100);
+      expect(result).toStrictEqual([a.slice(0, -1), b.slice(0, -1), c]);
+    });
+
+    it('returns text in batches which can be joined by new line to reconstruct the original text', () => {
+      const a = 'aaa\n'.repeat(100);
+      const b = 'bbb\n'.repeat(100);
+      const c = 'ccc\n'.repeat(50);
+      const content = a + b + c;
+      const result = batch(content, 100);
+      const joined = result.join('\n');
+      expect(joined).toEqual(content);
+    });
+
+    it('filters out lines longer than the max line length', () => {
+      const maxLen = 10;
+      const result = batch(`${'a'.repeat(maxLen + 1)}\nb\n${'c'.repeat(maxLen)}`, 100, maxLen);
+      expect(result).toStrictEqual([`b\n${'c'.repeat(maxLen)}`]);
+    });
+  });
 });
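
A note on why the 'yields to the event loop periodically' test proves what it claims, reasoning from event-loop ordering: 'aaa\n'.repeat(10000) splits into 10,001 lines (the trailing newline contributes an empty final line), so batch produces 11 batches and fileTokens awaits setImmediate at i = 5 and i = 10. The first of those immediates is scheduled before the test's own setImmediate and the second after it, so 'pass' wins the race precisely because the implementation yields mid-run. Had fileTokens run to completion without awaiting a macrotask, its already-settled promise would win the race and the test would fail.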
