Skip to content

Commit

Permalink
try esm
Browse files Browse the repository at this point in the history
  • Loading branch information
samthor committed Aug 29, 2022
1 parent e7dff43 commit de7f3bc
Show file tree
Hide file tree
Showing 13 changed files with 951 additions and 23 deletions.
562 changes: 562 additions & 0 deletions package-lock.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
"test": "node --test ./test.mjs"
},
"devDependencies": {
"@types/node": "^18.7.13",
"esbuild": "^0.15.5",
"google-closure-compiler": "^20220601.0.0"
}
}
23 changes: 23 additions & 0 deletions src/buffer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

/**
 * Decodes bytes to a string using Node's Buffer.
 *
 * @param {Uint8Array} bytes data to decode
 * @param {string} encoding a Buffer-supported encoding name
 * @return {string} the decoded text
 */
export function decodeBuffer(bytes, encoding) {
  // Reuse the instance when it's already a Buffer; otherwise wrap the same
  // backing memory (no copy), honoring the view's offset and length.
  const buf = bytes instanceof Buffer
    ? bytes
    : Buffer.from(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  return buf.toString(/** @type {BufferEncoding} */ (encoding));
}


/**
 * Encodes a string to UTF-8 bytes using Node's Buffer.
 *
 * @param {string} string text to encode
 * @return {Uint8Array} UTF-8 bytes (a Buffer, which is a Uint8Array subclass)
 */
export const encodeBuffer = (string) => {
  return Buffer.from(string);
};
141 changes: 141 additions & 0 deletions src/lowlevel.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@

/**
 * Decodes UTF-8 bytes into a string without TextDecoder or Buffer.
 *
 * nb. Unlike a native decoder, invalid sequences are NOT replaced with
 * "REPLACEMENT CHARACTER"; the data is parsed blindly (see comment below).
 *
 * @param {Uint8Array} bytes UTF-8 data to decode
 * @return {string} the decoded string
 */
export function decodeFallback(bytes) {
  let inputIndex = 0;

  // Create a working buffer for UTF-16 code points, but don't generate one
  // which is too large for small input sizes. UTF-8 to UCS-16 conversion is
  // going to be at most 1:1, if all code points are ASCII. The other extreme
  // is 4-byte UTF-8, which results in two UCS-16 points, but this is still 50%
  // fewer entries in the output.
  const pendingSize = Math.min(256 * 256, bytes.length + 1);
  const pending = new Uint16Array(pendingSize);
  const chunks = [];
  let pendingIndex = 0;

  for (; ;) {
    const more = inputIndex < bytes.length;

    // If there's no more data or there'd be no room for two UTF-16 values,
    // create a chunk. This isn't done at the end by simply slicing the data
    // into equal sized chunks as we might hit a surrogate pair.
    if (!more || (pendingIndex >= pendingSize - 1)) {
      // nb. .apply and friends are *really slow*. Low-hanging fruit is to
      // expand this to literally pass pending[0], pending[1], ... etc, but
      // the output code expands pretty fast in this case.
      // These extra vars get compiled out: they're just to make TS happy.
      // Turns out you can pass an ArrayLike to .apply().
      const subarray = pending.subarray(0, pendingIndex);
      const arraylike = /** @type {number[]} */ (/** @type {unknown} */ (subarray));
      chunks.push(String.fromCharCode.apply(null, arraylike));

      if (!more) {
        return chunks.join('');
      }

      // Move the buffer forward and create another chunk.
      bytes = bytes.subarray(inputIndex);
      inputIndex = 0;
      pendingIndex = 0;
    }

    // The native TextDecoder will generate "REPLACEMENT CHARACTER" where the
    // input data is invalid. Here, we blindly parse the data even if it's
    // wrong: e.g., if a 3-byte sequence doesn't have two valid continuations.

    const byte1 = bytes[inputIndex++];
    if ((byte1 & 0x80) === 0) { // 1-byte or null
      pending[pendingIndex++] = byte1;
    } else if ((byte1 & 0xe0) === 0xc0) { // 2-byte
      const byte2 = bytes[inputIndex++] & 0x3f;
      pending[pendingIndex++] = ((byte1 & 0x1f) << 6) | byte2;
    } else if ((byte1 & 0xf0) === 0xe0) { // 3-byte
      const byte2 = bytes[inputIndex++] & 0x3f;
      const byte3 = bytes[inputIndex++] & 0x3f;
      // nb. a 3-byte lead contributes only its low four bits, so mask with
      // 0x0f (the guard above guarantees bit 4 is zero, so the previous 0x1f
      // mask was behavior-identical but misleading).
      pending[pendingIndex++] = ((byte1 & 0x0f) << 12) | (byte2 << 6) | byte3;
    } else if ((byte1 & 0xf8) === 0xf0) { // 4-byte
      const byte2 = bytes[inputIndex++] & 0x3f;
      const byte3 = bytes[inputIndex++] & 0x3f;
      const byte4 = bytes[inputIndex++] & 0x3f;

      // this can be > 0xffff, so possibly generate surrogates
      let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
      if (codepoint > 0xffff) {
        // split into a UTF-16 surrogate pair
        codepoint -= 0x10000;
        pending[pendingIndex++] = (codepoint >>> 10) & 0x3ff | 0xd800;
        codepoint = 0xdc00 | codepoint & 0x3ff;
      }
      pending[pendingIndex++] = codepoint;
    } else {
      // invalid initial byte: skipped without emitting anything
    }
  }
}


/**
 * Encodes a string into UTF-8 bytes without TextEncoder or Buffer.
 *
 * Matches native TextEncoder by replacing unpaired surrogates with U+FFFD
 * (REPLACEMENT CHARACTER), per the WHATWG Encoding Standard's "convert to
 * USVString" step. (Previously, lone high surrogates were silently dropped
 * and lone low surrogates were emitted as invalid UTF-8.)
 *
 * @param {string} string text to encode
 * @return {Uint8Array} UTF-8 bytes
 */
export function encodeFallback(string) {
  let pos = 0;
  const len = string.length;

  let at = 0; // output position
  let tlen = Math.max(32, len + (len >>> 1) + 7); // 1.5x size
  let target = new Uint8Array((tlen >>> 3) << 3); // ... but at 8 byte offset

  while (pos < len) {
    let value = string.charCodeAt(pos++);
    if (value >= 0xd800 && value <= 0xdfff) {
      // surrogate range: try to combine a high surrogate with a following low
      if (value <= 0xdbff && pos < len) {
        const extra = string.charCodeAt(pos);
        if ((extra & 0xfc00) === 0xdc00) {
          ++pos;
          value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
        }
      }
      if (value >= 0xd800 && value <= 0xdfff) {
        // unpaired surrogate: native TextEncoder emits U+FFFD here rather
        // than dropping it or producing an invalid UTF-8 sequence
        value = 0xfffd;
      }
    }

    // expand the buffer if we couldn't write 4 bytes
    if (at + 4 > target.length) {
      tlen += 8; // minimum extra
      tlen *= (1.0 + (pos / string.length) * 2); // take 2x the remaining
      tlen = (tlen >>> 3) << 3; // 8 byte offset

      const update = new Uint8Array(tlen);
      update.set(target);
      target = update;
    }

    if ((value & 0xffffff80) === 0) { // 1-byte
      target[at++] = value; // ASCII
      continue;
    } else if ((value & 0xfffff800) === 0) { // 2-byte
      target[at++] = ((value >>> 6) & 0x1f) | 0xc0;
    } else if ((value & 0xffff0000) === 0) { // 3-byte
      target[at++] = ((value >>> 12) & 0x0f) | 0xe0;
      target[at++] = ((value >>> 6) & 0x3f) | 0x80;
    } else if ((value & 0xffe00000) === 0) { // 4-byte
      target[at++] = ((value >>> 18) & 0x07) | 0xf0;
      target[at++] = ((value >>> 12) & 0x3f) | 0x80;
      target[at++] = ((value >>> 6) & 0x3f) | 0x80;
    } else {
      continue; // out of range
    }

    // trailing continuation byte shared by the 2/3/4-byte branches above
    target[at++] = (value & 0x3f) | 0x80;
  }

  // Use subarray if slice isn't supported (IE11). This will use more memory
  // because the original array still exists.
  return target.slice ? target.slice(0, at) : target.subarray(0, at);
}
78 changes: 78 additions & 0 deletions src/o-decoder.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { decodeBuffer } from './buffer.js';
import { decodeFallback } from './lowlevel.js';
import { failedToString, maybeThrowFailedToOption } from './shared.js';
import { hasBufferFrom } from './support.js';
import { decodeSyncXHR } from './xhr.js';

// Sync XHR decoding is only attempted when Buffer is unavailable and the
// required browser APIs exist.
const trySyncXHR = !hasBufferFrom && (typeof Blob === 'function' && typeof URL === 'function' && typeof URL.createObjectURL === 'function');
const validUtfLabels = ['utf-8', 'utf8', 'unicode-1-1-utf-8'];

/** @type {(bytes: Uint8Array, encoding: string) => string} */
let decodeImpl = decodeFallback;
if (hasBufferFrom) {
  decodeImpl = decodeBuffer;
} else if (trySyncXHR) {
  decodeImpl = (bytes) => {
    // Sync XHR may be disabled at runtime (e.g., non-Edgium Edge), so always
    // keep the pure-JS decoder as a fallback.
    try {
      return decodeSyncXHR(bytes);
    } catch (e) {
      return decodeFallback(bytes);
    }
  };
}


// nb. `failedToString` already ends with a space, so no space is added before
// "construct" (doing so previously rendered "Failed to  construct").
const errorPrefix = `${failedToString}construct 'TextDecoder': the `;


/**
 * Polyfill for the TextDecoder constructor.
 *
 * @constructor
 * @param {string=} utfLabel encoding label; a UTF-8 variant (or, under Node,
 *     any Buffer-supported encoding)
 * @param {{fatal: boolean}=} options only checked for the unsupported `fatal` flag
 */
export function FastTextDecoder(utfLabel = 'utf-8', options) {
  maybeThrowFailedToOption(options && options.fatal, `construct 'TextDecoder'`, 'fatal');

  /** @type {boolean} */
  let ok;
  if (hasBufferFrom) {
    // Node's Buffer supports many encodings beyond the UTF-8 variants.
    ok = Buffer.isEncoding(utfLabel);
  } else {
    ok = validUtfLabels.indexOf(utfLabel.toLowerCase()) !== -1;
  }
  if (!ok) {
    // nb. `errorPrefix` already ends with a space, so no space is added before
    // "encoding" (doing so previously rendered "the  encoding").
    throw new RangeError(`${errorPrefix}encoding label provided ('${utfLabel}') is invalid.`);
  }

  this.encoding = utfLabel;
  this.fatal = false;
  this.ignoreBOM = false;
}

/**
 * Decodes the given bytes to a string.
 *
 * @param {(ArrayBuffer|ArrayBufferView)} buffer bytes to decode
 * @param {{stream: boolean}=} options only checked for the unsupported `stream` flag
 * @return {string} the decoded string
 */
FastTextDecoder.prototype.decode = function (buffer, options) {
  maybeThrowFailedToOption(options && options.stream, 'decode', 'stream');

  let bytes;

  if (buffer instanceof Uint8Array) {
    // Accept Uint8Array instances as-is. This is also a Node buffer.
    bytes = buffer;
  } else if (buffer['buffer'] instanceof ArrayBuffer) {
    // Look for ArrayBufferView, which isn't a real type, but basically
    // represents all the valid TypedArray types plus DataView. They all have
    // ".buffer" as an instance of ArrayBuffer. Honor byteOffset/byteLength:
    // a view may cover only part of its underlying buffer, and wrapping the
    // whole buffer would previously decode unrelated bytes.
    const view = /** @type {ArrayBufferView} */ (buffer);
    bytes = new Uint8Array(view.buffer, view.byteOffset, view.byteLength);
  } else {
    // The only other valid argument here is that "buffer" is an ArrayBuffer.
    // We also try to convert anything else passed to a Uint8Array, as this
    // catches anything that's array-like. Native code would throw here.
    bytes = new Uint8Array(/** @type {any} */(buffer));
  }

  return decodeImpl(bytes, this.encoding);
};
25 changes: 25 additions & 0 deletions src/o-encoder.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { encodeBuffer } from './buffer.js';
import { encodeFallback } from './lowlevel.js';
import { maybeThrowFailedToOption } from './shared.js';
import { hasBufferFrom } from './support.js';

export const encodeImpl = hasBufferFrom ? encodeFallback : encodeBuffer;

/**
 * Polyfill for the TextEncoder constructor.
 * @constructor
 */
export function FastTextEncoder() {
  // Per https://www.w3.org/TR/encoding/#dom-textencoder, TextEncoder takes no
  // encoding argument and always reports UTF-8.
  this.encoding = 'utf-8';
}

/**
 * Encodes the given string as UTF-8 bytes.
 *
 * @param {string} string text to encode
 * @param {{stream: boolean}=} options only checked for the unsupported `stream` flag
 * @return {Uint8Array} UTF-8 bytes
 */
FastTextEncoder.prototype.encode = function (string, options) {
  // Streaming isn't supported by this polyfill; refuse if it's requested.
  const wantsStream = options && options.stream;
  maybeThrowFailedToOption(wantsStream, 'encode', 'stream');
  return encodeImpl(string);
};
11 changes: 11 additions & 0 deletions src/polyfill.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

import { FastTextEncoder } from './o-encoder.js';
import { FastTextDecoder } from './o-decoder.js';

// nb. in an ES module, top-level `this` is undefined, so it cannot be relied
// on as the final fallback; `globalThis` is the standard handle and is checked
// first. The legacy chain is kept for very old environments.
/** @type {object} */
const scope = typeof globalThis !== 'undefined' ? globalThis :
  (typeof window !== 'undefined' ? window : (typeof global !== 'undefined' ? global : this));

// Only install the polyfills where a native implementation is missing.
scope['TextEncoder'] = scope['TextEncoder'] || FastTextEncoder;
scope['TextDecoder'] = scope['TextDecoder'] || FastTextDecoder;

export {};
13 changes: 13 additions & 0 deletions src/shared.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/** Common prefix for this polyfill's error messages. */
export const failedToString = 'Failed to ';

/**
 * Throws when a caller requests an option this polyfill doesn't implement.
 *
 * @param {boolean|undefined} check truthy if the unsupported option was passed
 * @param {string} operation description of the attempted operation
 * @param {string} fieldName name of the unsupported option
 */
export const maybeThrowFailedToOption = (check, operation, fieldName) => {
  if (!check) {
    return;
  }
  throw new Error(`${failedToString}${operation}: the '${fieldName}' option is unsupported.`);
};
2 changes: 2 additions & 0 deletions src/support.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

// Truthy when the Node Buffer API (with `Buffer.from`) is available; used to
// select the Buffer-backed encode/decode implementations. NOTE: the value is
// the `Buffer.from` function itself (or false), not a boolean — callers only
// test it for truthiness.
export const hasBufferFrom = (typeof Buffer === 'function' && Buffer.from);
29 changes: 29 additions & 0 deletions src/xhr.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

/**
 * This is a horrible hack which works in some old browsers. We can tell them to decode bytes via
 * sync XHR.
 *
 * Throws if fails. Should be wrapped in something to check that.
 *
 * @param {Uint8Array} bytes
 * @return {string}
 */
export function decodeSyncXHR(bytes) {
  /** @type {string|undefined} */
  let objectUrl;

  // Sync XHRs are disabled in some environments (e.g., non-Edgium Edge), so
  // this can throw at several points; the caller must provide a fallback.
  try {
    const blob = new Blob([bytes], { type: 'text/plain;charset=UTF-8' });
    objectUrl = URL.createObjectURL(blob);

    const request = new XMLHttpRequest();
    request.open('GET', objectUrl, false); // `false` makes the request synchronous
    request.send();
    return request.responseText;
  } finally {
    // Always release the object URL, even if the request threw.
    if (objectUrl) {
      URL.revokeObjectURL(objectUrl);
    }
  }
}
14 changes: 9 additions & 5 deletions test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ import assert from 'node:assert';
* @param {typeof TextEncoder} TextEncoder
* @param {typeof TextDecoder} TextDecoder
*/
export function tests(isNative, TextEncoder, TextDecoder) {
export async function tests(isNative, TextEncoder, TextDecoder) {
const dec = new TextDecoder();
const enc = new TextEncoder('utf-8');

console.info('running', { isNative, TextEncoder, TextDecoder });

test(isNative ? 'native suite' : 'polyfill suite', async (c) => {
await test(isNative ? 'native suite' : 'polyfill suite', async (c) => {
const test = c.test.bind(c);

await test('really large string', () => {
await test('really large string', async () => {
const chunks = new Array(64);
for (let i = 0; i < chunks.length; ++i) {
const s = new Array(65535).fill('x'.charCodeAt(0));
Expand Down Expand Up @@ -106,6 +106,10 @@ export function tests(isNative, TextEncoder, TextDecoder) {
assert.deepEqual(dec.decode(buffer), s);
});

await test('nodejs encodings', () => {
const d = new TextDecoder('utf16le');
});

});

await test('encoder', async (c) => {
Expand Down Expand Up @@ -160,5 +164,5 @@ export function tests(isNative, TextEncoder, TextDecoder) {
}


tests(true, NativeTextEncoder, NativeTextDecoder);
tests(false, TextEncoder, TextDecoder);
await tests(true, NativeTextEncoder, NativeTextDecoder);
await tests(false, TextEncoder, TextDecoder);
Loading

0 comments on commit de7f3bc

Please sign in to comment.