From bfe04008963fa57be6b21238694e41de2f0a3a90 Mon Sep 17 00:00:00 2001 From: Gregory Baker Date: Mon, 18 Dec 2023 13:21:20 -0700 Subject: [PATCH] perf: shorten hashes from 64 chars to 12 chars --- packages/reflect-yjs/src/chunk-config.ts | 24 ++++++++++++++++++++++++ packages/reflect-yjs/src/chunk.test.ts | 18 ++++++++++++++++-- packages/reflect-yjs/src/chunk.ts | 6 ++---- packages/reflect-yjs/src/mutators.ts | 10 +++++++--- 4 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 packages/reflect-yjs/src/chunk-config.ts diff --git a/packages/reflect-yjs/src/chunk-config.ts b/packages/reflect-yjs/src/chunk-config.ts new file mode 100644 index 0000000..9d75e67 --- /dev/null +++ b/packages/reflect-yjs/src/chunk-config.ts @@ -0,0 +1,24 @@ +import * as base64 from 'base64-js'; + +export const AVG_CHUNK_SIZE_B = 1024; +export const MIN_CHUNK_SIZE_B = 256; +export const MAX_CHUNK_SIZE_B = 2048; + +export const hashFn = async (chunk: Uint8Array) => { + const hashBuffer = await crypto.subtle.digest('SHA-256', chunk); + // Truncate the sha-256 hash from 32 bytes to 9 bytes. This gives us + // plenty of collision resistance for the range of expected document sizes. + // If we assume a max document size of 100MB, the probability of having a + // hash collision in a document of this size is roughly 1 in a trillion + // (based on the approximation function p(n) = n^2 / (2H) + // from + // https://en.wikipedia.org/wiki/Birthday_attack#Simple_approximation), + // where p(n) is probability of collision, n is number of hashes, and H + // is number of possible hash outputs. For 100MB document we have + // n = 100,000 (100,000 1KB chunks) and H = 2^(8*9), + // p(100,000) = (100,000^2 / (2*2^72)) = 1.0587912e-12 + // + // In base64 9 bytes will encode to 12 chars with no padding (all chars + // contain information). 
+ return base64.fromByteArray(new Uint8Array(hashBuffer.slice(0, 9))); +}; diff --git a/packages/reflect-yjs/src/chunk.test.ts b/packages/reflect-yjs/src/chunk.test.ts index 9ba2429..527376c 100644 --- a/packages/reflect-yjs/src/chunk.test.ts +++ b/packages/reflect-yjs/src/chunk.test.ts @@ -2,6 +2,7 @@ import {expect, suite, test} from 'vitest'; import {chunk, unchunk} from './chunk.js'; import {TEST_TEXT_LEAR} from './chunk-test-text-lear.js'; import {TEST_TEXT_MOBY_DICK} from './chunk-test-text-moby-dick.js'; +import {hashFn} from './chunk-config.js'; suite('chunk', () => { test('chunk is pure', async () => { @@ -9,12 +10,24 @@ suite('chunk', () => { const { chunksByHash: chunksByHash1, sourceAsChunkHashes: sourceAsChunkHashes1, - } = await chunk(Math.pow(2, 10), Math.pow(2, 9), Math.pow(2, 11), source); + } = await chunk( + Math.pow(2, 10), + Math.pow(2, 9), + Math.pow(2, 11), + source, + hashFn, + ); const { chunksByHash: chunksByHash2, sourceAsChunkHashes: sourceAsChunkHashes2, - } = await chunk(Math.pow(2, 10), Math.pow(2, 9), Math.pow(2, 11), source); + } = await chunk( + Math.pow(2, 10), + Math.pow(2, 9), + Math.pow(2, 11), + source, + hashFn, + ); expect(chunksByHash1).toEqual(chunksByHash2); expect(sourceAsChunkHashes1).toEqual(sourceAsChunkHashes2); @@ -33,6 +46,7 @@ suite('chunk', () => { min, max, source, + hashFn, ); let totalExcludingLast = 0; diff --git a/packages/reflect-yjs/src/chunk.ts b/packages/reflect-yjs/src/chunk.ts index ff0a517..32ec542 100644 --- a/packages/reflect-yjs/src/chunk.ts +++ b/packages/reflect-yjs/src/chunk.ts @@ -109,6 +109,7 @@ export async function chunk( minimum: number, maximum: number, source: Uint8Array, + hashFn: (chunk: Uint8Array) => Promise<string>, ): Promise<{ chunksByHash: Map<string, Uint8Array>; sourceAsChunkHashes: string[]; @@ -159,10 +160,7 @@ export async function chunk( } const chunk = source.slice(sourceOffset, sourceOffset + chunkSize); - const hashBuffer = await crypto.subtle.digest('SHA-256', chunk); - const hash = Array.from(new 
Uint8Array(hashBuffer)) - .map(b => b.toString(16).padStart(2, '0')) - .join(''); + const hash = await hashFn(chunk); if (!chunksByHash.has(hash)) { chunksByHash.set(hash, chunk); } diff --git a/packages/reflect-yjs/src/mutators.ts b/packages/reflect-yjs/src/mutators.ts index f1d67a7..d2836cf 100644 --- a/packages/reflect-yjs/src/mutators.ts +++ b/packages/reflect-yjs/src/mutators.ts @@ -8,6 +8,12 @@ import type { import * as base64 from 'base64-js'; import * as Y from 'yjs'; import {chunk, unchunk} from './chunk.js'; +import { + AVG_CHUNK_SIZE_B, + MAX_CHUNK_SIZE_B, + MIN_CHUNK_SIZE_B, + hashFn, +} from './chunk-config.js'; export const mutators = { yjsSetLocalStateField, @@ -96,9 +102,6 @@ function setClientUpdate( return tx.set(yjsProviderClientUpdateKey(name, id), update); } -const AVG_CHUNK_SIZE_B = 1024; -const MIN_CHUNK_SIZE_B = 256; -const MAX_CHUNK_SIZE_B = 2048; async function setServerUpdate( name: string, update: Uint8Array, @@ -116,6 +119,7 @@ async function setServerUpdate( MIN_CHUNK_SIZE_B, MAX_CHUNK_SIZE_B, update, + hashFn, ); const updateMeta: ChunkedUpdateMeta = { chunkHashes: chunkInfo.sourceAsChunkHashes,