From 1ea7660dee95ba8252d40782d6bdbb5cc226d99a Mon Sep 17 00:00:00 2001 From: Sam Thorogood Date: Fri, 1 Sep 2017 14:29:41 +1000 Subject: [PATCH] rewrite to polyfill of TextEncoder/TextDecoder --- js-utf.js | 101 --------------------------- test.html | 84 +++++++---------------- text.js | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 225 insertions(+), 159 deletions(-) delete mode 100644 js-utf.js create mode 100644 text.js diff --git a/js-utf.js b/js-utf.js deleted file mode 100644 index 0a9045a..0000000 --- a/js-utf.js +++ /dev/null @@ -1,101 +0,0 @@ -/** - * @param {string} string - * @param {!Array|!TypedArray} target - * @param {number=} at position to write into target - * @return {number} the number of bytes written - */ -export function encode(string, target, at=0) { - const start = at; - let pos = 0; - const len = string.length; - const out = []; - - while (pos < len) { - let value = string.charCodeAt(pos++); - if (value >= 0xd800 && value <= 0xdbff) { - // high surrogate - if (pos < len) { - const extra = string.charCodeAt(pos); - if ((extra & 0xfc00) === 0xdc00) { - ++pos; - value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000; - } - } - if (value >= 0xd800 && value <= 0xdbff) { - continue; // drop lone surrogate - } - } - - if ((value & 0xffffff80) === 0) { // 1-byte - target[at++] = value; // ASCII - continue; - } else if ((value & 0xfffff800) === 0) { // 2-byte - target[at++] = ((value >> 6) & 0x1f) | 0xc0; - } else if ((value & 0xffff0000) === 0) { // 3-byte - target[at++] = ((value >> 12) & 0x0f) | 0xe0; - target[at++] = ((value >> 6) & 0x3f) | 0x80; - } else if ((value & 0xffe00000) === 0) { // 4-byte - target[at++] = ((value >> 18) & 0x07) | 0xf0; - target[at++] = ((value >> 12) & 0x3f) | 0x80; - target[at++] = ((value >> 6) & 0x3f) | 0x80; - } else { - // FIXME: do we care - continue; - } - - target[at++] = (value & 0x3f) | 0x80; - } - - return at - start; -} - -/** - * @param {!Array|!TypedArray} bytes - * @return {string} - */ -export function decode(bytes, pos=0, len=-1) { - const start = pos; - const out = []; - - if (len < 0) { - len = bytes.length; - } else { - len = Math.min(len, bytes.length); - } - - while (pos < len) { - const byte1 = bytes[pos++]; - if (!byte1) { - break; // NULL or null-like - } - - if ((byte1 & 0x80) === 0) { // 1-byte - out.push(byte1); - } else if ((byte1 & 0xe0) === 0xc0) { // 2-byte - const byte2 = bytes[pos++] & 0x3f; - out.push(((byte1 & 0x1f) << 6) | byte2); - } else if ((byte1 & 0xf0) === 0xe0) { - const byte2 = bytes[pos++] & 0x3f; - const byte3 = bytes[pos++] & 0x3f; - out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3); - } else if ((byte1 & 0xf8) === 0xf0) { - const byte2 = bytes[pos++] & 0x3f; - const byte3 = bytes[pos++] & 0x3f; - const byte4 = bytes[pos++] & 0x3f; - - // this can be > 0xffff, so possibly generate surrogates - let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4; - if (codepoint > 0xffff) { - // codepoint &= ~0x10000; - codepoint -= 0x10000; - out.push((codepoint >>> 10) & 0x3ff | 0xd800) - codepoint = 0xdc00 | codepoint & 0x3ff; - } - out.push(codepoint); - } else { - // FIXME: we're ignoring this - } - } - - return String.fromCharCode(...out); -} \ No newline at end of file diff --git a/test.html b/test.html index 3dbf66e..a80122f 100644 --- a/test.html +++ b/test.html @@ -9,14 +9,35 @@ import 'https://cdn.rawgit.com/inexorabletash/text-encoding/b98ab30b/lib/encoding.js'; const polyfill = module.exports; - import * as utf from './js-utf.js'; + import './text.js'; - const runs = 100; + const runs = 1; const dataUrl = './utf8_sequence_0-0xffff_assigned_printable.txt'; (async function() { let text = await window.fetch(dataUrl).then((data) => data.text()); text = text.substr(0, 10000); + + function testEncodeDecode(name, tenc, tdec) { + console.time(name + '.TextEncoder'); + let saved; + const encoder = new tenc(); + for (let i = 0; i < runs; ++i) { + const out = encoder.encode(text); + saved = out; + } + console.info('got output', saved); + console.timeEnd(name + '.TextEncoder'); + + console.time(name + '.TextDecoder'); + const decoder = new tdec(); + for (let i = 0; i < runs; ++i) { + const out = decoder.decode(saved); + output = out; + } + console.timeEnd(name + '.TextDecoder'); + } + let saved, output; console.time('utf8.encode'); @@ -36,63 +57,10 @@ throw new Error('utf8 got wrong answer'); } - console.time('js-utf.encode'); - const out = new Uint8Array(1024 * 1024); - for (let i = 0; i < runs; ++i) { - const len = utf.encode(text, out); - saved = out.slice(0, len); - } - console.timeEnd('js-utf.encode'); - console.info(saved); - const polyfillSaved = saved; - - console.time('js-utf.decode'); - for (let i = 0; i < runs; ++i) { - const out = utf.decode(saved); - output = out; - } - console.timeEnd('js-utf.decode'); - if (output !== text) { - console.info('expected output length', text.length, 'was', output.length); - throw new Error('js-utf got wrong answer'); - } - - - console.time('TextEncoder'); - const encoder = new TextEncoder(); - for (let i = 0; i < runs; ++i) { - const out = encoder.encode(text); - saved = out; - } - console.timeEnd('TextEncoder'); - console.info(saved); - - console.time('TextDecoder'); - const decoder = new TextDecoder(); - for (let i = 0; i < runs; ++i) { - const out = decoder.decode(polyfillSaved); - output = out; - } - console.timeEnd('TextDecoder'); - - - console.time('polyfill.TextEncoder'); - const encoderP = new polyfill.TextEncoder(); - for (let i = 0; i < runs; ++i) { - const out = encoderP.encode(text); - saved = out; - } - console.timeEnd('polyfill.TextEncoder'); - console.info(saved); - - console.time('polyfill.TextDecoder'); - const decoderP = new polyfill.TextDecoder(); - for (let i = 0; i < runs; ++i) { - const out = decoderP.decode(polyfillSaved); - output = out; - } - console.timeEnd('polyfill.TextDecoder'); + testEncodeDecode('native', TextEncoder, TextDecoder); + testEncodeDecode('fast', TextEncoderPolyfill, TextDecoderPolyfill); + testEncodeDecode('polyfill', polyfill.TextEncoder, polyfill.TextDecoder); }()); diff --git a/text.js b/text.js new file mode 100644 index 0000000..07797c0 --- /dev/null +++ b/text.js @@ -0,0 +1,199 @@ +/* + * Copyright 2017 Sam Thorogood. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +/** + * @fileoverview Polyfill for TextEncoder and TextDecoder. + * + * You probably want `text.min.js`, and not this file directly. + */ + +(function(scope) { +'use strict'; + +// fail early +if (scope['TextEncoder'] && scope['TextDecoder']) { + return false; +} + +/** + * @constructor + * @param {string=} utfLabel + */ +function TextEncoder(utfLabel='utf-8') { + if (utfLabel !== 'utf-8') { + throw new RangeError( + `Failed to construct 'TextEncoder': The encoding label provided ('${utfLabel}') is invalid.`); + } +} + +/** + * @type {string} + */ +Object.defineProperty(TextEncoder.prototype, 'encoding', {value: 'utf-8'}); + +/** + * @param {string} string + * @param {{stream: boolean}=} options + * @return {!Uint8Array} + */ +TextEncoder.prototype.encode = function(string, options={}) { + if (options['stream']) { + throw new Error(`Failed to encode: the 'stream' option is unsupported.`); + } + + let pos = 0; + const len = string.length; + const out = []; + + let at = 0; // output position + let tlen = Math.max(32, len + (len >> 1) + 7); // 1.5x size + let target = new Uint8Array((tlen >> 3) << 3); // ... but at 8 byte offset + + while (pos < len) { + let value = string.charCodeAt(pos++); + if (value >= 0xd800 && value <= 0xdbff) { + // high surrogate + if (pos < len) { + const extra = string.charCodeAt(pos); + if ((extra & 0xfc00) === 0xdc00) { + ++pos; + value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000; + } + } + if (value >= 0xd800 && value <= 0xdbff) { + continue; // drop lone surrogate + } + } + + // expand the buffer if we couldn't write 4 bytes + if (at + 4 > target.length) { + tlen += 8; // minimum extra + tlen *= (1.0 + (pos / string.length) * 2); // take 2x the remaining + tlen = (tlen >> 3) << 3; // 8 byte offset + + const update = new Uint8Array(tlen); + update.set(target); + target = update; + } + + if ((value & 0xffffff80) === 0) { // 1-byte + target[at++] = value; // ASCII + continue; + } else if ((value & 0xfffff800) === 0) { // 2-byte + target[at++] = ((value >> 6) & 0x1f) | 0xc0; + } else if ((value & 0xffff0000) === 0) { // 3-byte + target[at++] = ((value >> 12) & 0x0f) | 0xe0; + target[at++] = ((value >> 6) & 0x3f) | 0x80; + } else if ((value & 0xffe00000) === 0) { // 4-byte + target[at++] = ((value >> 18) & 0x07) | 0xf0; + target[at++] = ((value >> 12) & 0x3f) | 0x80; + target[at++] = ((value >> 6) & 0x3f) | 0x80; + } else { + // FIXME: do we care + continue; + } + + target[at++] = (value & 0x3f) | 0x80; + } + + return target.slice(0, at); +} + +/** + * @constructor + * @param {string=} utfLabel + * @param {{fatal: boolean}=} options + */ +function TextDecoder(utfLabel='utf-8', options={}) { + if (utfLabel !== 'utf-8') { + throw new RangeError( + `Failed to construct 'TextDecoder': The encoding label provided ('${utfLabel}') is invalid.`); + } + if (options['fatal']) { + throw new Error(`Failed to construct 'TextDecoder': the 'fatal' option is unsupported.`); + } +} + +/** + * @type {string} + */ +Object.defineProperty(TextDecoder.prototype, 'encoding', {value: 'utf-8'}); + +/** + * @type {boolean} + */ +Object.defineProperty(TextDecoder.prototype, 'fatal', {value: false}); + +/** + * @type {boolean} + */ +Object.defineProperty(TextDecoder.prototype, 'ignoreBOM', {value: false}); + +/** + * @param {(!ArrayBuffer|!ArrayBufferView)} buffer + * @param {{stream: boolean}} options + */ +TextDecoder.prototype.decode = function(buffer, options={}) { + if (options['stream']) { + throw new Error(`Failed to decode: the 'stream' option is unsupported.`); + } + + const bytes = new Uint8Array(buffer); + let pos = 0; + const len = bytes.length; + const out = []; + + while (pos < len) { + const byte1 = bytes[pos++]; + if (byte1 === 0) { + break; // NULL + } + + if ((byte1 & 0x80) === 0) { // 1-byte + out.push(byte1); + } else if ((byte1 & 0xe0) === 0xc0) { // 2-byte + const byte2 = bytes[pos++] & 0x3f; + out.push(((byte1 & 0x1f) << 6) | byte2); + } else if ((byte1 & 0xf0) === 0xe0) { + const byte2 = bytes[pos++] & 0x3f; + const byte3 = bytes[pos++] & 0x3f; + out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3); + } else if ((byte1 & 0xf8) === 0xf0) { + const byte2 = bytes[pos++] & 0x3f; + const byte3 = bytes[pos++] & 0x3f; + const byte4 = bytes[pos++] & 0x3f; + + // this can be > 0xffff, so possibly generate surrogates + let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4; + if (codepoint > 0xffff) { + // codepoint &= ~0x10000; + codepoint -= 0x10000; + out.push((codepoint >>> 10) & 0x3ff | 0xd800) + codepoint = 0xdc00 | codepoint & 0x3ff; + } + out.push(codepoint); + } else { + // FIXME: we're ignoring this + } + } + + return String.fromCharCode(...out); +} + +scope['TextEncoder'] = TextEncoder; +scope['TextDecoder'] = TextDecoder; + +}(typeof window !== 'undefined' ? window : (typeof global !== 'undefined' ? global : this)));