From 01c3bf62bad0cd9b9763b9d09ce1e015038af79d Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Thu, 26 Oct 2023 11:10:27 +0900 Subject: [PATCH] chore: avoid known file signatures in datatypeId (#155) This is just a performance optimization for the Mapeo indexer so that it avoids trying to parse files that are not Mapeo Docs. For example, a hypercore might have PNG files written to it, which are prefixed by '89 50 4E 47 0D 0A 1A 0A'. If we used this as a dataTypeId then the indexer would think any PNGs in the core are a Mapeo datatype and try to parse them. It would fail and just be ignored, but trying to parse would have a performance cost. This is a check in the build script that will throw an error if a new dataType is added that matches one of the known file signature prefixes. In some cases we don't check against the whole file signature - we just avoid starting data type IDs with byte(s) that are common in file signatures. --- scripts/lib/parse-config.js | 43 +++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/scripts/lib/parse-config.js b/scripts/lib/parse-config.js index c4b45550..d24dad17 100644 --- a/scripts/lib/parse-config.js +++ b/scripts/lib/parse-config.js @@ -8,6 +8,48 @@ import { capitalize, PROJECT_ROOT } from './utils.js' // These messages are embedded in others and do not define Mapeo data types const EMBEDDED_MESSAGES = ['tags', 'common'] +// We avoid creating data type IDs that match these, since blobs (e.g. icons) +// can be stored in Mapeo hypercores, and we want to avoid trying to parse a +// file blob as a Mapeo datatype. This just minimizes cases where the Mapeo +// indexer might try to parse (and fail) a document that is not actually a Mapeo +// doc. 
+const KNOWN_FILE_SIGNATURE_PREFIXES = [ + [0xef, 0xbb, 0xbf], // UTF-8 BOM + [0xfe, 0xff], // UTF-16 BOM + [0x3c, 0x3f, 0x78, 0x6d, 0x6c], // ` { + let doesMatch = true + for (let i = 0; i < Math.min(prefix.length, 6); i++) { + if (prefix[i] !== buf[i]) { + doesMatch = false + } + } + return doesMatch + } + ) + if (matchingKnownFileSignature) { + throw new Error( + 'This datatype ID (' + + dataTypeId + + ') matches a known file signature, please choose a different one' + ) + } +} + /** * Parse the proto message types and check: * @@ -65,6 +107,7 @@ export function parseConfig() { throw new Error('Duplicate dataTypeId in ' + filepath) } duplicateIdCheck.set(dataTypeId, schemaName) + validateDatatypeId(dataTypeId) dataTypeIds[schemaName] = dataTypeId