Skip to content

Commit

Permalink
perf: shorten hashes from 64 chars to 12 chars
Browse files Browse the repository at this point in the history
  • Loading branch information
grgbkr committed Dec 18, 2023
1 parent 80ec3c8 commit bfe0400
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 9 deletions.
24 changes: 24 additions & 0 deletions packages/reflect-yjs/src/chunk-config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import * as base64 from 'base64-js';

// Chunk-size parameters shared by the callers of `chunk` (see chunk.ts).
// NOTE(review): the `_B` suffix presumably means bytes, i.e. an average of
// 1 KiB bounded to the range [256 B, 2 KiB] — confirm against chunk.ts.
export const AVG_CHUNK_SIZE_B = 1024;
export const MIN_CHUNK_SIZE_B = 256;
export const MAX_CHUNK_SIZE_B = 2048;

/**
 * Computes the short content hash used to identify a chunk.
 *
 * The chunk is digested with SHA-256 and only the first 9 of the 32 digest
 * bytes are kept. 9 bytes base64-encode to exactly 12 characters with no
 * padding, so every character of the result carries information.
 *
 * Why 9 bytes is enough: using the birthday-bound approximation
 * p(n) = n^2 / (2H) (see
 * https://en.wikipedia.org/wiki/Birthday_attack#Simple_approximation),
 * where n is the number of hashes and H the number of possible hash values,
 * a 100MB document split into ~1KB chunks gives n = 100,000 and
 * H = 2^(8*9) = 2^72, so
 *   p(100,000) = 100,000^2 / (2 * 2^72) ≈ 1.0587912e-12,
 * i.e. roughly a 1-in-a-trillion chance of any collision within a document
 * of that size.
 *
 * @param chunk - The bytes of the chunk to hash.
 * @returns A promise for the 12-character base64 encoding of the truncated
 *   SHA-256 digest.
 */
export const hashFn = async (chunk: Uint8Array) => {
  // Number of leading SHA-256 digest bytes retained in the hash.
  const TRUNCATED_HASH_BYTES = 9;

  const digest = await crypto.subtle.digest('SHA-256', chunk);
  // View only the leading bytes of the digest; the remaining 23 are dropped.
  const truncated = new Uint8Array(digest, 0, TRUNCATED_HASH_BYTES);
  return base64.fromByteArray(truncated);
};
18 changes: 16 additions & 2 deletions packages/reflect-yjs/src/chunk.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,32 @@ import {expect, suite, test} from 'vitest';
import {chunk, unchunk} from './chunk.js';
import {TEST_TEXT_LEAR} from './chunk-test-text-lear.js';
import {TEST_TEXT_MOBY_DICK} from './chunk-test-text-moby-dick.js';
import {hashFn} from './chunk-config.js';

suite('chunk', () => {
test('chunk is pure', async () => {
const source = new TextEncoder().encode(TEST_TEXT_LEAR);
const {
chunksByHash: chunksByHash1,
sourceAsChunkHashes: sourceAsChunkHashes1,
} = await chunk(Math.pow(2, 10), Math.pow(2, 9), Math.pow(2, 11), source);
} = await chunk(
Math.pow(2, 10),
Math.pow(2, 9),
Math.pow(2, 11),
source,
hashFn,
);

const {
chunksByHash: chunksByHash2,
sourceAsChunkHashes: sourceAsChunkHashes2,
} = await chunk(Math.pow(2, 10), Math.pow(2, 9), Math.pow(2, 11), source);
} = await chunk(
Math.pow(2, 10),
Math.pow(2, 9),
Math.pow(2, 11),
source,
hashFn,
);

expect(chunksByHash1).toEqual(chunksByHash2);
expect(sourceAsChunkHashes1).toEqual(sourceAsChunkHashes2);
Expand All @@ -33,6 +46,7 @@ suite('chunk', () => {
min,
max,
source,
hashFn,
);

let totalExcludingLast = 0;
Expand Down
6 changes: 2 additions & 4 deletions packages/reflect-yjs/src/chunk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ export async function chunk(
minimum: number,
maximum: number,
source: Uint8Array,
hashFn: (chunk: Uint8Array) => Promise<string>,
): Promise<{
chunksByHash: Map<string, Uint8Array>;
sourceAsChunkHashes: string[];
Expand Down Expand Up @@ -159,10 +160,7 @@ export async function chunk(
}

const chunk = source.slice(sourceOffset, sourceOffset + chunkSize);
const hashBuffer = await crypto.subtle.digest('SHA-256', chunk);
const hash = Array.from(new Uint8Array(hashBuffer))
.map(b => b.toString(16).padStart(2, '0'))
.join('');
const hash = await hashFn(chunk);
if (!chunksByHash.has(hash)) {
chunksByHash.set(hash, chunk);
}
Expand Down
10 changes: 7 additions & 3 deletions packages/reflect-yjs/src/mutators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ import type {
import * as base64 from 'base64-js';
import * as Y from 'yjs';
import {chunk, unchunk} from './chunk.js';
import {
AVG_CHUNK_SIZE_B,
MAX_CHUNK_SIZE_B,
MIN_CHUNK_SIZE_B,
hashFn,
} from './chunk-config.js';

export const mutators = {
yjsSetLocalStateField,
Expand Down Expand Up @@ -96,9 +102,6 @@ function setClientUpdate(
return tx.set(yjsProviderClientUpdateKey(name, id), update);
}

const AVG_CHUNK_SIZE_B = 1024;
const MIN_CHUNK_SIZE_B = 256;
const MAX_CHUNK_SIZE_B = 2048;
async function setServerUpdate(
name: string,
update: Uint8Array,
Expand All @@ -116,6 +119,7 @@ async function setServerUpdate(
MIN_CHUNK_SIZE_B,
MAX_CHUNK_SIZE_B,
update,
hashFn,
);
const updateMeta: ChunkedUpdateMeta = {
chunkHashes: chunkInfo.sourceAsChunkHashes,
Expand Down

0 comments on commit bfe0400

Please sign in to comment.