Skip to content

Commit

Permalink
try esm
Browse files Browse the repository at this point in the history
  • Loading branch information
samthor committed Aug 29, 2022
1 parent e7dff43 commit de7f3bc
Show file tree
Hide file tree
Showing 13 changed files with 951 additions and 23 deletions.
562 changes: 562 additions & 0 deletions package-lock.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
"test": "node --test ./test.mjs"
},
"devDependencies": {
"@types/node": "^18.7.13",
"esbuild": "^0.15.5",
"google-closure-compiler": "^20220601.0.0"
}
}
23 changes: 23 additions & 0 deletions src/buffer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

/**
 * Decodes bytes to a string using Node's Buffer.
 *
 * @param {Uint8Array} bytes data to decode
 * @param {string} encoding a Buffer-supported encoding name
 * @return {string} the decoded text
 */
export function decodeBuffer(bytes, encoding) {
  // Reuse the instance when it's already a Buffer; otherwise wrap the same
  // backing memory (no copy), honoring the view's offset and length.
  const buf = bytes instanceof Buffer
    ? bytes
    : Buffer.from(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  return buf.toString(/** @type {BufferEncoding} */ (encoding));
}


/**
 * Encodes a string to UTF-8 bytes using Node's Buffer.
 *
 * @param {string} string text to encode
 * @return {Uint8Array} UTF-8 bytes (a Buffer, which is a Uint8Array subclass)
 */
export const encodeBuffer = (string) => {
  return Buffer.from(string);
};
141 changes: 141 additions & 0 deletions src/lowlevel.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@

/**
 * Decodes UTF-8 bytes into a string without TextDecoder or Buffer.
 *
 * nb. Unlike a native decoder, invalid sequences are NOT replaced with
 * "REPLACEMENT CHARACTER"; the data is parsed blindly (see comment below).
 *
 * @param {Uint8Array} bytes UTF-8 data to decode
 * @return {string} the decoded string
 */
export function decodeFallback(bytes) {
  let inputIndex = 0;

  // Create a working buffer for UTF-16 code points, but don't generate one
  // which is too large for small input sizes. UTF-8 to UCS-16 conversion is
  // going to be at most 1:1, if all code points are ASCII. The other extreme
  // is 4-byte UTF-8, which results in two UCS-16 points, but this is still 50%
  // fewer entries in the output.
  const pendingSize = Math.min(256 * 256, bytes.length + 1);
  const pending = new Uint16Array(pendingSize);
  const chunks = [];
  let pendingIndex = 0;

  for (; ;) {
    const more = inputIndex < bytes.length;

    // If there's no more data or there'd be no room for two UTF-16 values,
    // create a chunk. This isn't done at the end by simply slicing the data
    // into equal sized chunks as we might hit a surrogate pair.
    if (!more || (pendingIndex >= pendingSize - 1)) {
      // nb. .apply and friends are *really slow*. Low-hanging fruit is to
      // expand this to literally pass pending[0], pending[1], ... etc, but
      // the output code expands pretty fast in this case.
      // These extra vars get compiled out: they're just to make TS happy.
      // Turns out you can pass an ArrayLike to .apply().
      const subarray = pending.subarray(0, pendingIndex);
      const arraylike = /** @type {number[]} */ (/** @type {unknown} */ (subarray));
      chunks.push(String.fromCharCode.apply(null, arraylike));

      if (!more) {
        return chunks.join('');
      }

      // Move the buffer forward and create another chunk.
      bytes = bytes.subarray(inputIndex);
      inputIndex = 0;
      pendingIndex = 0;
    }

    // The native TextDecoder will generate "REPLACEMENT CHARACTER" where the
    // input data is invalid. Here, we blindly parse the data even if it's
    // wrong: e.g., if a 3-byte sequence doesn't have two valid continuations.

    const byte1 = bytes[inputIndex++];
    if ((byte1 & 0x80) === 0) { // 1-byte or null
      pending[pendingIndex++] = byte1;
    } else if ((byte1 & 0xe0) === 0xc0) { // 2-byte
      const byte2 = bytes[inputIndex++] & 0x3f;
      pending[pendingIndex++] = ((byte1 & 0x1f) << 6) | byte2;
    } else if ((byte1 & 0xf0) === 0xe0) { // 3-byte
      const byte2 = bytes[inputIndex++] & 0x3f;
      const byte3 = bytes[inputIndex++] & 0x3f;
      // nb. a 3-byte lead contributes only its low four bits, so mask with
      // 0x0f (the guard above guarantees bit 4 is zero, so the previous 0x1f
      // mask was behavior-identical but misleading).
      pending[pendingIndex++] = ((byte1 & 0x0f) << 12) | (byte2 << 6) | byte3;
    } else if ((byte1 & 0xf8) === 0xf0) { // 4-byte
      const byte2 = bytes[inputIndex++] & 0x3f;
      const byte3 = bytes[inputIndex++] & 0x3f;
      const byte4 = bytes[inputIndex++] & 0x3f;

      // this can be > 0xffff, so possibly generate surrogates
      let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
      if (codepoint > 0xffff) {
        // split into a UTF-16 surrogate pair
        codepoint -= 0x10000;
        pending[pendingIndex++] = (codepoint >>> 10) & 0x3ff | 0xd800;
        codepoint = 0xdc00 | codepoint & 0x3ff;
      }
      pending[pendingIndex++] = codepoint;
    } else {
      // invalid initial byte: skipped without emitting anything
    }
  }
}


/**
 * Encodes a string into UTF-8 bytes without TextEncoder or Buffer.
 *
 * Matches native TextEncoder by replacing unpaired surrogates with U+FFFD
 * (REPLACEMENT CHARACTER), per the WHATWG Encoding Standard's "convert to
 * USVString" step. (Previously, lone high surrogates were silently dropped
 * and lone low surrogates were emitted as invalid UTF-8.)
 *
 * @param {string} string text to encode
 * @return {Uint8Array} UTF-8 bytes
 */
export function encodeFallback(string) {
  let pos = 0;
  const len = string.length;

  let at = 0; // output position
  let tlen = Math.max(32, len + (len >>> 1) + 7); // 1.5x size
  let target = new Uint8Array((tlen >>> 3) << 3); // ... but at 8 byte offset

  while (pos < len) {
    let value = string.charCodeAt(pos++);
    if (value >= 0xd800 && value <= 0xdfff) {
      // surrogate range: try to combine a high surrogate with a following low
      if (value <= 0xdbff && pos < len) {
        const extra = string.charCodeAt(pos);
        if ((extra & 0xfc00) === 0xdc00) {
          ++pos;
          value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
        }
      }
      if (value >= 0xd800 && value <= 0xdfff) {
        // unpaired surrogate: native TextEncoder emits U+FFFD here rather
        // than dropping it or producing an invalid UTF-8 sequence
        value = 0xfffd;
      }
    }

    // expand the buffer if we couldn't write 4 bytes
    if (at + 4 > target.length) {
      tlen += 8; // minimum extra
      tlen *= (1.0 + (pos / string.length) * 2); // take 2x the remaining
      tlen = (tlen >>> 3) << 3; // 8 byte offset

      const update = new Uint8Array(tlen);
      update.set(target);
      target = update;
    }

    if ((value & 0xffffff80) === 0) { // 1-byte
      target[at++] = value; // ASCII
      continue;
    } else if ((value & 0xfffff800) === 0) { // 2-byte
      target[at++] = ((value >>> 6) & 0x1f) | 0xc0;
    } else if ((value & 0xffff0000) === 0) { // 3-byte
      target[at++] = ((value >>> 12) & 0x0f) | 0xe0;
      target[at++] = ((value >>> 6) & 0x3f) | 0x80;
    } else if ((value & 0xffe00000) === 0) { // 4-byte
      target[at++] = ((value >>> 18) & 0x07) | 0xf0;
      target[at++] = ((value >>> 12) & 0x3f) | 0x80;
      target[at++] = ((value >>> 6) & 0x3f) | 0x80;
    } else {
      continue; // out of range
    }

    // trailing continuation byte shared by the 2/3/4-byte branches above
    target[at++] = (value & 0x3f) | 0x80;
  }

  // Use subarray if slice isn't supported (IE11). This will use more memory
  // because the original array still exists.
  return target.slice ? target.slice(0, at) : target.subarray(0, at);
}
78 changes: 78 additions & 0 deletions src/o-decoder.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { decodeBuffer } from './buffer.js';
import { decodeFallback } from './lowlevel.js';
import { failedToString, maybeThrowFailedToOption } from './shared.js';
import { hasBufferFrom } from './support.js';
import { decodeSyncXHR } from './xhr.js';

// Sync XHR decoding is only attempted when Buffer is unavailable and the
// required browser APIs exist.
const trySyncXHR = !hasBufferFrom && (typeof Blob === 'function' && typeof URL === 'function' && typeof URL.createObjectURL === 'function');
const validUtfLabels = ['utf-8', 'utf8', 'unicode-1-1-utf-8'];

/** @type {(bytes: Uint8Array, encoding: string) => string} */
let decodeImpl = decodeFallback;
if (hasBufferFrom) {
  decodeImpl = decodeBuffer;
} else if (trySyncXHR) {
  decodeImpl = (bytes) => {
    // Sync XHR may be disabled at runtime (e.g., non-Edgium Edge), so always
    // keep the pure-JS decoder as a fallback.
    try {
      return decodeSyncXHR(bytes);
    } catch (e) {
      return decodeFallback(bytes);
    }
  };
}


// nb. `failedToString` already ends with a space, so no space is added before
// "construct" (doing so previously rendered "Failed to  construct").
const errorPrefix = `${failedToString}construct 'TextDecoder': the `;


/**
 * Polyfill for the TextDecoder constructor.
 *
 * @constructor
 * @param {string=} utfLabel encoding label; a UTF-8 variant (or, under Node,
 *     any Buffer-supported encoding)
 * @param {{fatal: boolean}=} options only checked for the unsupported `fatal` flag
 */
export function FastTextDecoder(utfLabel = 'utf-8', options) {
  maybeThrowFailedToOption(options && options.fatal, `construct 'TextDecoder'`, 'fatal');

  /** @type {boolean} */
  let ok;
  if (hasBufferFrom) {
    // Node's Buffer supports many encodings beyond the UTF-8 variants.
    ok = Buffer.isEncoding(utfLabel);
  } else {
    ok = validUtfLabels.indexOf(utfLabel.toLowerCase()) !== -1;
  }
  if (!ok) {
    // nb. `errorPrefix` already ends with a space, so no space is added before
    // "encoding" (doing so previously rendered "the  encoding").
    throw new RangeError(`${errorPrefix}encoding label provided ('${utfLabel}') is invalid.`);
  }

  this.encoding = utfLabel;
  this.fatal = false;
  this.ignoreBOM = false;
}

/**
 * Decodes the given bytes to a string.
 *
 * @param {(ArrayBuffer|ArrayBufferView)} buffer bytes to decode
 * @param {{stream: boolean}=} options only checked for the unsupported `stream` flag
 * @return {string} the decoded string
 */
FastTextDecoder.prototype.decode = function (buffer, options) {
  maybeThrowFailedToOption(options && options.stream, 'decode', 'stream');

  let bytes;

  if (buffer instanceof Uint8Array) {
    // Accept Uint8Array instances as-is. This is also a Node buffer.
    bytes = buffer;
  } else if (buffer['buffer'] instanceof ArrayBuffer) {
    // Look for ArrayBufferView, which isn't a real type, but basically
    // represents all the valid TypedArray types plus DataView. They all have
    // ".buffer" as an instance of ArrayBuffer. Honor byteOffset/byteLength:
    // a view may cover only part of its underlying buffer, and wrapping the
    // whole buffer would previously decode unrelated bytes.
    const view = /** @type {ArrayBufferView} */ (buffer);
    bytes = new Uint8Array(view.buffer, view.byteOffset, view.byteLength);
  } else {
    // The only other valid argument here is that "buffer" is an ArrayBuffer.
    // We also try to convert anything else passed to a Uint8Array, as this
    // catches anything that's array-like. Native code would throw here.
    bytes = new Uint8Array(/** @type {any} */(buffer));
  }

  return decodeImpl(bytes, this.encoding);
};
25 changes: 25 additions & 0 deletions src/o-encoder.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { encodeBuffer } from './buffer.js';
import { encodeFallback } from './lowlevel.js';
import { maybeThrowFailedToOption } from './shared.js';
import { hasBufferFrom } from './support.js';

export const encodeImpl = hasBufferFrom ? encodeFallback : encodeBuffer;

/**
 * Polyfill for the TextEncoder constructor.
 * @constructor
 */
export function FastTextEncoder() {
  // Per https://www.w3.org/TR/encoding/#dom-textencoder, TextEncoder takes no
  // encoding argument and always reports UTF-8.
  this.encoding = 'utf-8';
}

/**
 * Encodes the given string as UTF-8 bytes.
 *
 * @param {string} string text to encode
 * @param {{stream: boolean}=} options only checked for the unsupported `stream` flag
 * @return {Uint8Array} UTF-8 bytes
 */
FastTextEncoder.prototype.encode = function (string, options) {
  // Streaming isn't supported by this polyfill; refuse if it's requested.
  const wantsStream = options && options.stream;
  maybeThrowFailedToOption(wantsStream, 'encode', 'stream');
  return encodeImpl(string);
};
11 changes: 11 additions & 0 deletions src/polyfill.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

import { FastTextEncoder } from './o-encoder.js';
import { FastTextDecoder } from './o-decoder.js';

// nb. in an ES module, top-level `this` is undefined, so it cannot be relied
// on as the final fallback; `globalThis` is the standard handle and is checked
// first. The legacy chain is kept for very old environments.
/** @type {object} */
const scope = typeof globalThis !== 'undefined' ? globalThis :
  (typeof window !== 'undefined' ? window : (typeof global !== 'undefined' ? global : this));

// Only install the polyfills where a native implementation is missing.
scope['TextEncoder'] = scope['TextEncoder'] || FastTextEncoder;
scope['TextDecoder'] = scope['TextDecoder'] || FastTextDecoder;

export {};
13 changes: 13 additions & 0 deletions src/shared.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/** Common prefix for this polyfill's error messages. */
export const failedToString = 'Failed to ';

/**
 * Throws when a caller requests an option this polyfill doesn't implement.
 *
 * @param {boolean|undefined} check truthy if the unsupported option was passed
 * @param {string} operation description of the attempted operation
 * @param {string} fieldName name of the unsupported option
 */
export const maybeThrowFailedToOption = (check, operation, fieldName) => {
  if (!check) {
    return;
  }
  throw new Error(`${failedToString}${operation}: the '${fieldName}' option is unsupported.`);
};
2 changes: 2 additions & 0 deletions src/support.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

// Truthy when the Node Buffer API (with `Buffer.from`) is available; used to
// select the Buffer-backed encode/decode implementations. NOTE: the value is
// the `Buffer.from` function itself (or false), not a boolean — callers only
// test it for truthiness.
export const hasBufferFrom = (typeof Buffer === 'function' && Buffer.from);
29 changes: 29 additions & 0 deletions src/xhr.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

/**
 * This is a horrible hack which works in some old browsers. We can tell them to decode bytes via
 * sync XHR.
 *
 * Throws if fails. Should be wrapped in something to check that.
 *
 * @param {Uint8Array} bytes
 * @return {string}
 */
export function decodeSyncXHR(bytes) {
  /** @type {string|undefined} */
  let objectUrl;

  // Sync XHRs are disabled in some environments (e.g., non-Edgium Edge), so
  // this can throw at several points; the caller must provide a fallback.
  try {
    const blob = new Blob([bytes], { type: 'text/plain;charset=UTF-8' });
    objectUrl = URL.createObjectURL(blob);

    const request = new XMLHttpRequest();
    request.open('GET', objectUrl, false); // `false` makes the request synchronous
    request.send();
    return request.responseText;
  } finally {
    // Always release the object URL, even if the request threw.
    if (objectUrl) {
      URL.revokeObjectURL(objectUrl);
    }
  }
}
14 changes: 9 additions & 5 deletions test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ import assert from 'node:assert';
* @param {typeof TextEncoder} TextEncoder
* @param {typeof TextDecoder} TextDecoder
*/
export function tests(isNative, TextEncoder, TextDecoder) {
export async function tests(isNative, TextEncoder, TextDecoder) {
const dec = new TextDecoder();
const enc = new TextEncoder('utf-8');

console.info('running', { isNative, TextEncoder, TextDecoder });

test(isNative ? 'native suite' : 'polyfill suite', async (c) => {
await test(isNative ? 'native suite' : 'polyfill suite', async (c) => {
const test = c.test.bind(c);

await test('really large string', () => {
await test('really large string', async () => {
const chunks = new Array(64);
for (let i = 0; i < chunks.length; ++i) {
const s = new Array(65535).fill('x'.charCodeAt(0));
Expand Down Expand Up @@ -106,6 +106,10 @@ export function tests(isNative, TextEncoder, TextDecoder) {
assert.deepEqual(dec.decode(buffer), s);
});

await test('nodejs encodings', () => {
const d = new TextDecoder('utf16le');
});

});

await test('encoder', async (c) => {
Expand Down Expand Up @@ -160,5 +164,5 @@ export function tests(isNative, TextEncoder, TextDecoder) {
}


tests(true, NativeTextEncoder, NativeTextDecoder);
tests(false, TextEncoder, TextDecoder);
await tests(true, NativeTextEncoder, NativeTextDecoder);
await tests(false, TextEncoder, TextDecoder);
Loading

0 comments on commit de7f3bc

Please sign in to comment.