/*
 * js-utf.js -- UTF-8 encode/decode helpers.
 *
 * Restored from a whitespace-collapsed `git format-patch` of commit
 * 093aae9c7f545e873c594ab24fbfbd5be52e55a7, "exported encode/decode utf8"
 * (Sam Thorogood, Fri, 1 Sep 2017 13:29:12 +1000).
 *
 * The same patch created three more files. They are separate files and are
 * recorded here only so that nothing from the patch is lost:
 *
 *   .gitignore
 *     # sample data
 *     *.txt
 *
 *   polyfill.js (assigns to `window`, so it is not part of this module)
 *     window.module = {
 *       exports: {},
 *     };
 *
 *     const hack = new Proxy({}, {
 *       get() {
 *         return [];
 *       },
 *     });
 *     window.require = () => ({'encoding-indexes': hack});
 *
 *   test.html
 *     101 added lines according to the patch; the markup itself was not
 *     present in the collapsed text.
 */

/**
 * Upper bound on the number of arguments handed to a single
 * `String.fromCharCode` call. Engines cap the argument count of one call, so
 * long inputs are converted in slices of this size.
 */
const FROM_CHAR_CODE_CHUNK = 0x2000;

/**
 * Joins UTF-16 code units into a string without passing them all to one call.
 *
 * @param {!Array<number>} units UTF-16 code units
 * @return {string}
 */
function codeUnitsToString(units) {
  let result = '';
  for (let i = 0; i < units.length; i += FROM_CHAR_CODE_CHUNK) {
    result += String.fromCharCode(...units.slice(i, i + FROM_CHAR_CODE_CHUNK));
  }
  return result;
}

/**
 * Encodes a JavaScript (UTF-16) string as UTF-8 bytes, writing them into
 * `target` by plain indexed assignment starting at `at`.
 *
 * Valid surrogate pairs are combined into one 4-byte sequence. Unpaired
 * surrogates (high or low) are dropped, so the output never contains an
 * encoded surrogate. No bounds checking is done on `target`; the caller must
 * make sure it has room.
 *
 * @param {string} string
 * @param {!Array|!TypedArray} target
 * @param {number=} at position to write into target
 * @return {number} the number of bytes written
 */
export function encode(string, target, at = 0) {
  const start = at;
  const len = string.length;
  let pos = 0;

  while (pos < len) {
    let value = string.charCodeAt(pos++);

    if ((value & 0xf800) === 0xd800) {
      // Surrogate range 0xd800-0xdfff. Only a high surrogate immediately
      // followed by a low surrogate forms a code point.
      const isHigh = value <= 0xdbff;
      const extra = isHigh && pos < len ? string.charCodeAt(pos) : 0;
      if ((extra & 0xfc00) !== 0xdc00) {
        // Drop lone surrogate. `pos` is not advanced past `extra`, so the
        // following unit is still processed on its own.
        continue;
      }
      ++pos;
      value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
    }

    if (value < 0x80) { // 1-byte (ASCII)
      target[at++] = value;
    } else if (value < 0x800) { // 2-byte
      target[at++] = (value >> 6) | 0xc0;
      target[at++] = (value & 0x3f) | 0x80;
    } else if (value < 0x10000) { // 3-byte
      target[at++] = (value >> 12) | 0xe0;
      target[at++] = ((value >> 6) & 0x3f) | 0x80;
      target[at++] = (value & 0x3f) | 0x80;
    } else { // 4-byte; value is at most 0x10ffff because it came from a pair
      target[at++] = (value >> 18) | 0xf0;
      target[at++] = ((value >> 12) & 0x3f) | 0x80;
      target[at++] = ((value >> 6) & 0x3f) | 0x80;
      target[at++] = (value & 0x3f) | 0x80;
    }
  }

  return at - start;
}

/**
 * Decodes UTF-8 bytes into a JavaScript string.
 *
 * Decoding stops at the first zero (or otherwise falsy) lead byte, or when
 * the end index is reached. Bytes that are not a valid lead byte
 * (continuation bytes, 0xf8 and above) are skipped. Continuation bytes are
 * not validated; a missing one contributes zero bits.
 *
 * @param {!Array|!TypedArray} bytes
 * @param {number=} pos index of the first byte to decode
 * @param {number=} len exclusive end index into `bytes` (an absolute offset,
 *     not a count relative to `pos`); negative means "to the end of bytes"
 * @return {string}
 */
export function decode(bytes, pos = 0, len = -1) {
  const end = len < 0 ? bytes.length : Math.min(len, bytes.length);
  const out = [];

  while (pos < end) {
    const byte1 = bytes[pos++];
    if (!byte1) {
      break; // NULL or null-like
    }

    if ((byte1 & 0x80) === 0) { // 1-byte
      out.push(byte1);
    } else if ((byte1 & 0xe0) === 0xc0) { // 2-byte
      const byte2 = bytes[pos++] & 0x3f;
      out.push(((byte1 & 0x1f) << 6) | byte2);
    } else if ((byte1 & 0xf0) === 0xe0) { // 3-byte
      const byte2 = bytes[pos++] & 0x3f;
      const byte3 = bytes[pos++] & 0x3f;
      out.push(((byte1 & 0x0f) << 12) | (byte2 << 6) | byte3);
    } else if ((byte1 & 0xf8) === 0xf0) { // 4-byte
      const byte2 = bytes[pos++] & 0x3f;
      const byte3 = bytes[pos++] & 0x3f;
      const byte4 = bytes[pos++] & 0x3f;

      // This can be > 0xffff, so possibly generate a surrogate pair.
      let codepoint =
          ((byte1 & 0x07) << 18) | (byte2 << 12) | (byte3 << 6) | byte4;
      if (codepoint > 0xffff) {
        codepoint -= 0x10000;
        out.push(((codepoint >>> 10) & 0x3ff) | 0xd800);
        codepoint = 0xdc00 | (codepoint & 0x3ff);
      }
      out.push(codepoint);
    }
    // Any other lead byte is ignored.
  }

  return codeUnitsToString(out);
}