From 1ea7660dee95ba8252d40782d6bdbb5cc226d99a Mon Sep 17 00:00:00 2001
From: Sam Thorogood <sam.thorogood@gmail.com>
Date: Fri, 1 Sep 2017 14:29:41 +1000
Subject: [PATCH] rewrite to polyfill of TextEncoder/TextDecoder

---
 js-utf.js | 101 ---------------------------
 test.html |  84 +++++++----------------
 text.js   | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 225 insertions(+), 159 deletions(-)
 delete mode 100644 js-utf.js
 create mode 100644 text.js
diff --git a/js-utf.js b/js-utf.js
deleted file mode 100644
index 0a9045a..0000000
--- a/js-utf.js
+++ /dev/null
@@ -1,101 +0,0 @@
-/**
- * @param {string} string
- * @param {!Array<number>|!TypedArray} target
- * @param {number=} at position to write into target
- * @return {number} the number of bytes written
- */
-export function encode(string, target, at=0) {
-  const start = at;
-  let pos = 0;
-  const len = string.length;
-  const out = [];
-
-  while (pos < len) {
-    let value = string.charCodeAt(pos++);
-    if (value >= 0xd800 && value <= 0xdbff) {
-      // high surrogate
-      if (pos < len) {
-        const extra = string.charCodeAt(pos);
-        if ((extra & 0xfc00) === 0xdc00) {
-          ++pos;
-          value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
-        }
-      }
-      if (value >= 0xd800 && value <= 0xdbff) {
-        continue;  // drop lone surrogate
-      }
-    }
-
-    if ((value & 0xffffff80) === 0) {  // 1-byte
-      target[at++] = value;  // ASCII
-      continue;
-    } else if ((value & 0xfffff800) === 0) {  // 2-byte
-      target[at++] = ((value >>  6) & 0x1f) | 0xc0;
-    } else if ((value & 0xffff0000) === 0) {  // 3-byte
-      target[at++] = ((value >> 12) & 0x0f) | 0xe0;
-      target[at++] = ((value >>  6) & 0x3f) | 0x80;
-    } else if ((value & 0xffe00000) === 0) {  // 4-byte
-      target[at++] = ((value >> 18) & 0x07) | 0xf0;
-      target[at++] = ((value >> 12) & 0x3f) | 0x80;
-      target[at++] = ((value >>  6) & 0x3f) | 0x80;
-    } else { 
-      // FIXME: do we care
-      continue;
-    }
-
-    target[at++] = (value & 0x3f) | 0x80;
-  }
-
-  return at - start;
-}
-
-/**
- * @param {!Array<number>|!TypedArray} bytes
- * @return {string}
- */
-export function decode(bytes, pos=0, len=-1) {
-  const start = pos;
-  const out = [];
-
-  if (len < 0) {
-    len = bytes.length;
-  } else {
-    len = Math.min(len, bytes.length);
-  }
-
-  while (pos < len) {
-    const byte1 = bytes[pos++];
-    if (!byte1) {
-      break;  // NULL or null-like
-    }
-  
-    if ((byte1 & 0x80) === 0) {  // 1-byte
-      out.push(byte1);
-    } else if ((byte1 & 0xe0) === 0xc0) {  // 2-byte
-      const byte2 = bytes[pos++] & 0x3f;
-      out.push(((byte1 & 0x1f) << 6) | byte2);
-    } else if ((byte1 & 0xf0) === 0xe0) {
-      const byte2 = bytes[pos++] & 0x3f;
-      const byte3 = bytes[pos++] & 0x3f;
-      out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
-    } else if ((byte1 & 0xf8) === 0xf0) {
-      const byte2 = bytes[pos++] & 0x3f;
-      const byte3 = bytes[pos++] & 0x3f;
-      const byte4 = bytes[pos++] & 0x3f;
-
-      // this can be > 0xffff, so possibly generate surrogates
-      let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
-      if (codepoint > 0xffff) {
-        // codepoint &= ~0x10000;
-        codepoint -= 0x10000;
-        out.push((codepoint >>> 10) & 0x3ff | 0xd800)
-        codepoint = 0xdc00 | codepoint & 0x3ff;
-      }
-      out.push(codepoint);
-    } else {
-      // FIXME: we're ignoring this
-    }
-  }
-
-  return String.fromCharCode(...out);
-}
\ No newline at end of file
diff --git a/test.html b/test.html
index 3dbf66e..a80122f 100644
--- a/test.html
+++ b/test.html
@@ -9,14 +9,35 @@
   import 'https://cdn.rawgit.com/inexorabletash/text-encoding/b98ab30b/lib/encoding.js';
   const polyfill = module.exports;
 
-  import * as utf from './js-utf.js';
+  import './text.js';
 
-  const runs = 100;
+  const runs = 1;
   const dataUrl = './utf8_sequence_0-0xffff_assigned_printable.txt';
 
   (async function() {
     let text = await window.fetch(dataUrl).then((data) => data.text());
     text = text.substr(0, 10000);
+
+    function testEncodeDecode(name, tenc, tdec) {
+      console.time(name + '.TextEncoder');
+      let saved;
+      const encoder = new tenc();
+      for (let i = 0; i < runs; ++i) {
+        const out = encoder.encode(text);
+        saved = out;
+      }
+      console.info('got output', saved);
+      console.timeEnd(name + '.TextEncoder');
+
+      console.time(name + '.TextDecoder');
+      const decoder = new tdec();
+      for (let i = 0; i < runs; ++i) {
+        const out = decoder.decode(saved);
+        output = out;
+      }
+      console.timeEnd(name + '.TextDecoder');
+    }
+
     let saved, output;
 
     console.time('utf8.encode');
@@ -36,63 +57,10 @@
       throw new Error('utf8 got wrong answer');
     }
 
-    console.time('js-utf.encode');
-    const out = new Uint8Array(1024 * 1024);
-    for (let i = 0; i < runs; ++i) {
-      const len = utf.encode(text, out);
-      saved = out.slice(0, len);
-    }
-    console.timeEnd('js-utf.encode');
-    console.info(saved);
-    const polyfillSaved = saved;
-
-    console.time('js-utf.decode');
-    for (let i = 0; i < runs; ++i) {
-      const out = utf.decode(saved);
-      output = out;
-    }
-    console.timeEnd('js-utf.decode');
-    if (output !== text) {
-      console.info('expected output length', text.length, 'was', output.length);
-      throw new Error('js-utf got wrong answer');
-    }
-
-
-    console.time('TextEncoder');
-    const encoder = new TextEncoder();
-    for (let i = 0; i < runs; ++i) {
-      const out = encoder.encode(text);
-      saved = out;
-    }
-    console.timeEnd('TextEncoder');
-    console.info(saved);
-
-    console.time('TextDecoder');
-    const decoder = new TextDecoder();
-    for (let i = 0; i < runs; ++i) {
-      const out = decoder.decode(polyfillSaved);
-      output = out;
-    }
-    console.timeEnd('TextDecoder');
-
-
-    console.time('polyfill.TextEncoder');
-    const encoderP = new polyfill.TextEncoder();
-    for (let i = 0; i < runs; ++i) {
-      const out = encoderP.encode(text);
-      saved = out;
-    }
-    console.timeEnd('polyfill.TextEncoder');
-    console.info(saved);
-
-    console.time('polyfill.TextDecoder');
-    const decoderP = new polyfill.TextDecoder();
-    for (let i = 0; i < runs; ++i) {
-      const out = decoderP.decode(polyfillSaved);
-      output = out;
-    }
-    console.timeEnd('polyfill.TextDecoder');
 
+    testEncodeDecode('native', TextEncoder, TextDecoder);
+    testEncodeDecode('fast', TextEncoderPolyfill, TextDecoderPolyfill);
+    testEncodeDecode('polyfill', polyfill.TextEncoder, polyfill.TextDecoder);
 
   }());
 
diff --git a/text.js b/text.js
new file mode 100644
index 0000000..07797c0
--- /dev/null
+++ b/text.js
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2017 Sam Thorogood. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+/**
+ * @fileoverview Polyfill for TextEncoder and TextDecoder.
+ *
+ * You probably want `text.min.js`, and not this file directly.
+ */
+
+(function(scope) {
+'use strict';
+
+// fail early: both constructors already exist on the target scope, so leave them untouched
+if (scope['TextEncoder'] && scope['TextDecoder']) {
+  return false;
+}
+
+/**
+ * @constructor
+ * @param {string=} utfLabel
+ */
+function TextEncoder(utfLabel='utf-8') {
+  if (utfLabel !== 'utf-8') {
+    throw new RangeError(
+      `Failed to construct 'TextEncoder': The encoding label provided ('${utfLabel}') is invalid.`);
+  }
+}
+
+/**
+ * @type {string} fixed at 'utf-8'; defined with only a value, so it is not writable
+ */
+Object.defineProperty(TextEncoder.prototype, 'encoding', {value: 'utf-8'});
+
+/**
+ * @param {string} string
+ * @param {{stream: boolean}=} options
+ * @return {!Uint8Array}
+ */
+TextEncoder.prototype.encode = function(string, options={}) {
+  if (options['stream']) {
+    throw new Error(`Failed to encode: the 'stream' option is unsupported.`);
+  }
+
+  let pos = 0;
+  const len = string.length;
+  const out = [];
+
+  let at = 0;  // output position
+  let tlen = Math.max(32, len + (len >> 1) + 7);  // 1.5x size
+  let target = new Uint8Array((tlen >> 3) << 3);  // ... but at 8 byte offset
+
+  while (pos < len) {
+    let value = string.charCodeAt(pos++);
+    if (value >= 0xd800 && value <= 0xdbff) {
+      // high surrogate
+      if (pos < len) {
+        const extra = string.charCodeAt(pos);
+        if ((extra & 0xfc00) === 0xdc00) {
+          ++pos;
+          value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
+        }
+      }
+      if (value >= 0xd800 && value <= 0xdbff) {
+        continue;  // drop lone surrogate
+      }
+    }
+
+    // expand the buffer if we couldn't write 4 bytes
+    if (at + 4 > target.length) {
+      tlen += 8;  // minimum extra
+      tlen *= (1.0 + (pos / string.length) * 2);  // take 2x the remaining
+      tlen = (tlen >> 3) << 3;  // 8 byte offset
+
+      const update = new Uint8Array(tlen);
+      update.set(target);
+      target = update;
+    }
+
+    if ((value & 0xffffff80) === 0) {  // 1-byte
+      target[at++] = value;  // ASCII
+      continue;
+    } else if ((value & 0xfffff800) === 0) {  // 2-byte
+      target[at++] = ((value >>  6) & 0x1f) | 0xc0;
+    } else if ((value & 0xffff0000) === 0) {  // 3-byte
+      target[at++] = ((value >> 12) & 0x0f) | 0xe0;
+      target[at++] = ((value >>  6) & 0x3f) | 0x80;
+    } else if ((value & 0xffe00000) === 0) {  // 4-byte
+      target[at++] = ((value >> 18) & 0x07) | 0xf0;
+      target[at++] = ((value >> 12) & 0x3f) | 0x80;
+      target[at++] = ((value >>  6) & 0x3f) | 0x80;
+    } else {
+      // FIXME: do we care
+      continue;
+    }
+
+    target[at++] = (value & 0x3f) | 0x80;
+  }
+
+  return target.slice(0, at);
+}
+
+/**
+ * @constructor
+ * @param {string=} utfLabel
+ * @param {{fatal: boolean}=} options
+ */
+function TextDecoder(utfLabel='utf-8', options={}) {
+  if (utfLabel !== 'utf-8') {
+    throw new RangeError(
+      `Failed to construct 'TextDecoder': The encoding label provided ('${utfLabel}') is invalid.`);
+  }
+  if (options['fatal']) {
+    throw new Error(`Failed to construct 'TextDecoder': the 'fatal' option is unsupported.`);
+  }
+}
+
+/**
+ * @type {string} fixed at 'utf-8'; defined with only a value, so it is not writable
+ */
+Object.defineProperty(TextDecoder.prototype, 'encoding', {value: 'utf-8'});
+
+/**
+ * @type {boolean} always false; the constructor throws when the 'fatal' option is requested
+ */
+Object.defineProperty(TextDecoder.prototype, 'fatal', {value: false});
+
+/**
+ * @type {boolean} always false; defined with only a value, so it is not writable
+ */
+Object.defineProperty(TextDecoder.prototype, 'ignoreBOM', {value: false});
+
+/**
+ * @param {(!ArrayBuffer|!ArrayBufferView)} buffer
+ * @param {{stream: boolean}} options
+ */
+TextDecoder.prototype.decode = function(buffer, options={}) {
+  if (options['stream']) {
+    throw new Error(`Failed to decode: the 'stream' option is unsupported.`);
+  }
+
+  const bytes = new Uint8Array(buffer);
+  let pos = 0;
+  const len = bytes.length;
+  const out = [];
+
+  while (pos < len) {
+    const byte1 = bytes[pos++];
+    if (byte1 === 0) {
+      break;  // NULL
+    }
+  
+    if ((byte1 & 0x80) === 0) {  // 1-byte
+      out.push(byte1);
+    } else if ((byte1 & 0xe0) === 0xc0) {  // 2-byte
+      const byte2 = bytes[pos++] & 0x3f;
+      out.push(((byte1 & 0x1f) << 6) | byte2);
+    } else if ((byte1 & 0xf0) === 0xe0) {
+      const byte2 = bytes[pos++] & 0x3f;
+      const byte3 = bytes[pos++] & 0x3f;
+      out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
+    } else if ((byte1 & 0xf8) === 0xf0) {
+      const byte2 = bytes[pos++] & 0x3f;
+      const byte3 = bytes[pos++] & 0x3f;
+      const byte4 = bytes[pos++] & 0x3f;
+
+      // this can be > 0xffff, so possibly generate surrogates
+      let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
+      if (codepoint > 0xffff) {
+        // codepoint &= ~0x10000;
+        codepoint -= 0x10000;
+        out.push((codepoint >>> 10) & 0x3ff | 0xd800)
+        codepoint = 0xdc00 | codepoint & 0x3ff;
+      }
+      out.push(codepoint);
+    } else {
+      // FIXME: we're ignoring this
+    }
+  }
+
+  return String.fromCharCode(...out);
+}
+
+scope['TextEncoder'] = TextEncoder;  // reached only when at least one of the two was missing
+scope['TextDecoder'] = TextDecoder;  // both are assigned, replacing any single existing one
+
+}(typeof window !== 'undefined' ? window : (typeof global !== 'undefined' ? global : this)));