Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: shorten chunk hashes from 64 chars to 12 chars #41

Merged
merged 1 commit into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions packages/reflect-yjs/src/chunk-config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import * as base64 from 'base64-js';

// Chunk size settings shared by callers of chunk(). The `_B` suffix
// presumably denotes bytes -- TODO confirm against chunk()'s contract.

// Presumably the target average chunk size handed to chunk(); the hash
// collision estimate in hashFn assumes roughly 1KB chunks.
export const AVG_CHUNK_SIZE_B = 1024;
// Presumably the lower bound on chunk size (chunk()'s `minimum` argument).
export const MIN_CHUNK_SIZE_B = 256;
// Presumably the upper bound on chunk size (chunk()'s `maximum` argument).
export const MAX_CHUNK_SIZE_B = 2048;

/**
 * Produces a short content hash for a chunk of bytes.
 *
 * The chunk is digested with SHA-256 and only the first 9 of the 32 digest
 * bytes are kept. Those 9 bytes are then base64-encoded, which yields exactly
 * 12 characters with no padding (every character carries information).
 *
 * Why 9 bytes is enough: using the birthday approximation
 * p(n) = n^2 / (2H)
 * (https://en.wikipedia.org/wiki/Birthday_attack#Simple_approximation),
 * where p(n) is the collision probability, n the number of hashes and H the
 * number of possible hash outputs, a 100MB document split into 1KB chunks
 * gives n = 100,000 and H = 2^(8*9) = 2^72, so
 * p(100,000) = 100,000^2 / (2 * 2^72) = 1.0587912e-12,
 * i.e. roughly a 1 in a trillion chance of any collision in such a document.
 *
 * @param chunk - The bytes to hash.
 * @returns The base64 encoding of the truncated SHA-256 digest.
 */
export const hashFn = async (chunk: Uint8Array) => {
  // Number of leading digest bytes retained; a multiple of 3 so the base64
  // output needs no '=' padding.
  const keptBytes = 9;

  const digest = await crypto.subtle.digest('SHA-256', chunk);
  const truncated = new Uint8Array(digest).slice(0, keptBytes);
  return base64.fromByteArray(truncated);
};
18 changes: 16 additions & 2 deletions packages/reflect-yjs/src/chunk.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,32 @@ import {expect, suite, test} from 'vitest';
import {chunk, unchunk} from './chunk.js';
import {TEST_TEXT_LEAR} from './chunk-test-text-lear.js';
import {TEST_TEXT_MOBY_DICK} from './chunk-test-text-moby-dick.js';
import {hashFn} from './chunk-config.js';

suite('chunk', () => {
test('chunk is pure', async () => {
const source = new TextEncoder().encode(TEST_TEXT_LEAR);
const {
chunksByHash: chunksByHash1,
sourceAsChunkHashes: sourceAsChunkHashes1,
} = await chunk(Math.pow(2, 10), Math.pow(2, 9), Math.pow(2, 11), source);
} = await chunk(
Math.pow(2, 10),
Math.pow(2, 9),
Math.pow(2, 11),
source,
hashFn,
);

const {
chunksByHash: chunksByHash2,
sourceAsChunkHashes: sourceAsChunkHashes2,
} = await chunk(Math.pow(2, 10), Math.pow(2, 9), Math.pow(2, 11), source);
} = await chunk(
Math.pow(2, 10),
Math.pow(2, 9),
Math.pow(2, 11),
source,
hashFn,
);

expect(chunksByHash1).toEqual(chunksByHash2);
expect(sourceAsChunkHashes1).toEqual(sourceAsChunkHashes2);
Expand All @@ -33,6 +46,7 @@ suite('chunk', () => {
min,
max,
source,
hashFn,
);

let totalExcludingLast = 0;
Expand Down
6 changes: 2 additions & 4 deletions packages/reflect-yjs/src/chunk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ export async function chunk(
minimum: number,
maximum: number,
source: Uint8Array,
hashFn: (chunk: Uint8Array) => Promise<string>,
): Promise<{
chunksByHash: Map<string, Uint8Array>;
sourceAsChunkHashes: string[];
Expand Down Expand Up @@ -159,10 +160,7 @@ export async function chunk(
}

const chunk = source.slice(sourceOffset, sourceOffset + chunkSize);
const hashBuffer = await crypto.subtle.digest('SHA-256', chunk);
const hash = Array.from(new Uint8Array(hashBuffer))
.map(b => b.toString(16).padStart(2, '0'))
.join('');
const hash = await hashFn(chunk);
if (!chunksByHash.has(hash)) {
chunksByHash.set(hash, chunk);
}
Expand Down
10 changes: 7 additions & 3 deletions packages/reflect-yjs/src/mutators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ import type {
import * as base64 from 'base64-js';
import * as Y from 'yjs';
import {chunk, unchunk} from './chunk.js';
import {
AVG_CHUNK_SIZE_B,
MAX_CHUNK_SIZE_B,
MIN_CHUNK_SIZE_B,
hashFn,
} from './chunk-config.js';

export const mutators = {
yjsSetLocalStateField,
Expand Down Expand Up @@ -96,9 +102,6 @@ function setClientUpdate(
return tx.set(yjsProviderClientUpdateKey(name, id), update);
}

const AVG_CHUNK_SIZE_B = 1024;
const MIN_CHUNK_SIZE_B = 256;
const MAX_CHUNK_SIZE_B = 2048;
async function setServerUpdate(
name: string,
update: Uint8Array,
Expand All @@ -116,6 +119,7 @@ async function setServerUpdate(
MIN_CHUNK_SIZE_B,
MAX_CHUNK_SIZE_B,
update,
hashFn,
);
const updateMeta: ChunkedUpdateMeta = {
chunkHashes: chunkInfo.sourceAsChunkHashes,
Expand Down