forked from samthor/fast-text-encoding
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
951 additions
and
23 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
|
||
/**
 * Decodes bytes into a string by delegating to Node's Buffer.
 *
 * @param {Uint8Array} bytes data to decode
 * @param {string} encoding encoding name passed to Buffer's toString()
 * @return {string}
 */
export function decodeBuffer(bytes, encoding) {
  // A Buffer is used as-is. Any other Uint8Array is re-wrapped as a Buffer
  // over the same ArrayBuffer range (same offset, same length).
  const source = bytes instanceof Buffer
    ? bytes
    : Buffer.from(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  return source.toString(/** @type {BufferEncoding} */ (encoding));
}
|
||
|
||
/**
 * Encodes a string into bytes using Buffer.from() with its default encoding.
 *
 * @param {string} string text to encode
 * @return {Uint8Array}
 */
export const encodeBuffer = (string) => {
  return Buffer.from(string);
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
|
||
/**
 * Pure-JavaScript UTF-8 decoder.
 *
 * Malformed input is neither reported nor replaced: continuation bytes are
 * masked and combined without validation, and a byte that cannot start a
 * sequence is skipped without producing any output.
 *
 * @param {Uint8Array} bytes UTF-8 data
 * @return {string}
 */
export function decodeFallback(bytes) {
  // UTF-8 never produces more UTF-16 units than it has bytes, so one slot per
  // input byte (plus one) is enough; the scratch buffer is capped at 65536.
  const capacity = Math.min(256 * 256, bytes.length + 1);
  const units = new Uint16Array(capacity);
  const pieces = [];
  let readAt = 0;
  let writeAt = 0;

  while (true) {
    const hasInput = readAt < bytes.length;

    // Flush when the input is exhausted, or when fewer than two slots remain:
    // the next character may need a surrogate pair, which must stay together.
    if (!hasInput || writeAt >= capacity - 1) {
      const filled = units.subarray(0, writeAt);
      // The double cast exists only for type checking; .apply() accepts any
      // array-like as its argument list.
      const asNumbers = /** @type {number[]} */ (/** @type {unknown} */ (filled));
      pieces.push(String.fromCharCode.apply(null, asNumbers));

      if (!hasInput) {
        return pieces.join('');
      }

      // Drop the consumed prefix and start refilling the scratch buffer.
      bytes = bytes.subarray(readAt);
      readAt = 0;
      writeAt = 0;
    }

    const lead = bytes[readAt++];

    if ((lead & 0x80) === 0) {
      // Single byte (ASCII, including NUL).
      units[writeAt++] = lead;
      continue;
    }

    if ((lead & 0xe0) === 0xc0) {
      // Two-byte sequence.
      const c1 = bytes[readAt++] & 0x3f;
      units[writeAt++] = ((lead & 0x1f) << 6) | c1;
      continue;
    }

    if ((lead & 0xf0) === 0xe0) {
      // Three-byte sequence.
      const c1 = bytes[readAt++] & 0x3f;
      const c2 = bytes[readAt++] & 0x3f;
      units[writeAt++] = ((lead & 0x1f) << 12) | (c1 << 6) | c2;
      continue;
    }

    if ((lead & 0xf8) !== 0xf0) {
      // Not a valid lead byte: consume it and emit nothing.
      continue;
    }

    // Four-byte sequence.
    const c1 = bytes[readAt++] & 0x3f;
    const c2 = bytes[readAt++] & 0x3f;
    const c3 = bytes[readAt++] & 0x3f;
    const codepoint = ((lead & 0x07) << 0x12) | (c1 << 0x0c) | (c2 << 0x06) | c3;

    if (codepoint > 0xffff) {
      // Outside the BMP: split into a high/low surrogate pair.
      const offset = codepoint - 0x10000;
      units[writeAt++] = 0xd800 | ((offset >>> 10) & 0x3ff);
      units[writeAt++] = 0xdc00 | (offset & 0x3ff);
    } else {
      units[writeAt++] = codepoint;
    }
  }
}
|
||
|
||
/**
 * Pure-JavaScript UTF-8 encoder.
 *
 * Unpaired surrogates (a high surrogate not followed by a low surrogate, or a
 * low surrogate on its own) are encoded as U+FFFD REPLACEMENT CHARACTER
 * (EF BF BD), so the output is always well-formed UTF-8.
 *
 * @param {string} string text to encode
 * @return {Uint8Array}
 */
export function encodeFallback(string) {
  let pos = 0;
  const len = string.length;

  let at = 0; // output position
  let tlen = Math.max(32, len + (len >>> 1) + 7); // 1.5x size
  let target = new Uint8Array((tlen >>> 3) << 3); // ... but at 8 byte offset

  while (pos < len) {
    let value = string.charCodeAt(pos++);

    if (value >= 0xd800 && value <= 0xdbff) {
      // High surrogate: combine with the following low surrogate if there is
      // one, otherwise substitute the replacement character.
      const extra = pos < len ? string.charCodeAt(pos) : 0;
      if ((extra & 0xfc00) === 0xdc00) {
        ++pos;
        value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
      } else {
        value = 0xfffd;
      }
    } else if (value >= 0xdc00 && value <= 0xdfff) {
      // Low surrogate with no preceding high surrogate. Encoding it directly
      // would emit a surrogate code point, which is not valid UTF-8.
      value = 0xfffd;
    }

    // Expand the buffer if we couldn't write 4 bytes.
    if (at + 4 > target.length) {
      tlen += 8; // minimum extra
      tlen *= (1.0 + (pos / len) * 2); // scale by up to 3x as pos nears the end
      tlen = (tlen >>> 3) << 3; // 8 byte offset

      const update = new Uint8Array(tlen);
      update.set(target);
      target = update;
    }

    if ((value & 0xffffff80) === 0) { // 1-byte
      target[at++] = value; // ASCII
      continue;
    } else if ((value & 0xfffff800) === 0) { // 2-byte
      target[at++] = ((value >>> 6) & 0x1f) | 0xc0;
    } else if ((value & 0xffff0000) === 0) { // 3-byte
      target[at++] = ((value >>> 12) & 0x0f) | 0xe0;
      target[at++] = ((value >>> 6) & 0x3f) | 0x80;
    } else if ((value & 0xffe00000) === 0) { // 4-byte
      target[at++] = ((value >>> 18) & 0x07) | 0xf0;
      target[at++] = ((value >>> 12) & 0x3f) | 0x80;
      target[at++] = ((value >>> 6) & 0x3f) | 0x80;
    } else {
      continue; // out of range
    }

    // Every multi-byte form ends with the same trailing continuation byte.
    target[at++] = (value & 0x3f) | 0x80;
  }

  // Use subarray if slice isn't supported (IE11). This will use more memory
  // because the original array still exists.
  return target.slice ? target.slice(0, at) : target.subarray(0, at);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import { decodeBuffer } from './buffer.js'; | ||
import { decodeFallback } from './lowlevel.js'; | ||
import { failedToString, maybeThrowFailedToOption } from './shared.js'; | ||
import { hasBufferFrom } from './support.js'; | ||
import { decodeSyncXHR } from './xhr.js'; | ||
|
||
// The sync-XHR decoder is only considered when Buffer.from is unavailable and
// the Blob / URL.createObjectURL APIs it needs are present.
const trySyncXHR = !hasBufferFrom && (
  typeof Blob === 'function' &&
  typeof URL === 'function' &&
  typeof URL.createObjectURL === 'function'
);

// Labels accepted as UTF-8 when Buffer is not available to validate them.
const validUtfLabels = ['utf-8', 'utf8', 'unicode-1-1-utf-8'];

// Pick the decoder once at load time: Buffer first, then sync XHR backed by
// the pure-JS decoder, then the pure-JS decoder alone.
/** @type {(bytes: Uint8Array, encoding: string) => string} */
let decodeImpl;
if (hasBufferFrom) {
  decodeImpl = decodeBuffer;
} else if (trySyncXHR) {
  decodeImpl = (bytes) => {
    try {
      return decodeSyncXHR(bytes);
    } catch (e) {
      // Any failure of the XHR path falls back to the pure-JS decoder.
      return decodeFallback(bytes);
    }
  };
} else {
  decodeImpl = decodeFallback;
}
|
||
|
||
// `failedToString` already ends with a space, and this prefix ends with one
// too, so neither is followed by another space when interpolated.
const errorPrefix = `${failedToString}construct 'TextDecoder': the `;

/**
 * Polyfill constructor for TextDecoder.
 *
 * When Buffer.from is available the label is validated with
 * Buffer.isEncoding(); otherwise it must be one of the known UTF-8 labels
 * (compared case-insensitively).
 *
 * @constructor
 * @param {string=} utfLabel encoding label, defaults to 'utf-8'
 * @param {{fatal: boolean}=} options the 'fatal' option is unsupported; a
 *     truthy value makes the constructor throw
 * @throws {RangeError} if the encoding label is not accepted
 */
export function FastTextDecoder(utfLabel = 'utf-8', options) {
  maybeThrowFailedToOption(options && options.fatal, `construct 'TextDecoder'`, 'fatal');

  /** @type {boolean} */
  let ok;
  if (hasBufferFrom) {
    ok = Buffer.isEncoding(utfLabel);
  } else {
    ok = validUtfLabels.indexOf(utfLabel.toLowerCase()) !== -1;
  }
  if (!ok) {
    throw new RangeError(`${errorPrefix}encoding label provided ('${utfLabel}') is invalid.`);
  }

  this.encoding = utfLabel;
  this.fatal = false;
  this.ignoreBOM = false;
}
|
||
/**
 * Decodes the given bytes to a string using this decoder's encoding.
 *
 * @param {(ArrayBuffer|ArrayBufferView)} buffer data to decode
 * @param {{stream: boolean}=} options the 'stream' option is unsupported; a
 *     truthy value makes this method throw
 * @return {string}
 */
FastTextDecoder.prototype.decode = function (buffer, options) {
  maybeThrowFailedToOption(options && options.stream, 'decode', 'stream');

  let bytes;

  if (buffer instanceof Uint8Array) {
    // Accept Uint8Array instances as-is. This is also a Node buffer.
    bytes = buffer;
  } else if (buffer['buffer'] instanceof ArrayBuffer) {
    // Look for ArrayBufferView, which isn't a real type, but basically
    // represents all the valid TypedArray types plus DataView. They all have
    // ".buffer" as an instance of ArrayBuffer.
    // A view may cover only part of its backing buffer, so honour its
    // byteOffset/byteLength rather than decoding the whole ArrayBuffer.
    const view = /** @type {ArrayBufferView} */ (buffer);
    bytes = new Uint8Array(view.buffer, view.byteOffset, view.byteLength);
  } else {
    // The only other valid argument here is that "buffer" is an ArrayBuffer.
    // We also try to convert anything else passed to a Uint8Array, as this
    // catches anything that's array-like. Native code would throw here.
    bytes = new Uint8Array(/** @type {any} */ (buffer));
  }

  return decodeImpl(bytes, this.encoding);
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import { encodeBuffer } from './buffer.js'; | ||
import { encodeFallback } from './lowlevel.js'; | ||
import { maybeThrowFailedToOption } from './shared.js'; | ||
import { hasBufferFrom } from './support.js'; | ||
|
||
// Use the Buffer-based encoder when Buffer.from exists, and the pure-JS
// encoder otherwise. encodeBuffer calls Buffer.from, so it must never be
// selected when Buffer is missing.
export const encodeImpl = hasBufferFrom ? encodeBuffer : encodeFallback;
|
||
/**
 * Polyfill constructor for TextEncoder. It takes no encoding argument because
 * the output is always UTF-8:
 * https://www.w3.org/TR/encoding/#dom-textencoder
 *
 * @constructor
 */
export function FastTextEncoder() {
  this.encoding = 'utf-8';
}
|
||
/**
 * Encodes a string to bytes with the encoder implementation chosen at load
 * time.
 *
 * @param {string} string text to encode
 * @param {{stream: boolean}=} options the 'stream' option is unsupported; a
 *     truthy value makes this method throw
 * @return {Uint8Array}
 */
FastTextEncoder.prototype.encode = function (string, options) {
  const wantsStream = options && options.stream;
  maybeThrowFailedToOption(wantsStream, 'encode', 'stream');
  return encodeImpl(string);
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
|
||
import { FastTextEncoder } from './o-encoder.js'; | ||
import { FastTextDecoder } from './o-decoder.js'; | ||
|
||
/**
 * The global object to install the polyfills on. `window` and `global` are
 * probed first; `self` and `globalThis` cover environments that have neither
 * (for example Web Workers). `this` is kept as a last resort for classic
 * scripts; inside an ES module it is undefined.
 *
 * @type {object}
 */
const scope =
  typeof window !== 'undefined' ? window :
  typeof global !== 'undefined' ? global :
  typeof self !== 'undefined' ? self :
  typeof globalThis !== 'undefined' ? globalThis :
  this;

// Only fill in what's missing; never replace a native implementation.
scope['TextEncoder'] = scope['TextEncoder'] || FastTextEncoder;
scope['TextDecoder'] = scope['TextDecoder'] || FastTextDecoder;
|
||
export {}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
|
||
/** Leading text for error messages; note the trailing space. */
export const failedToString = 'Failed to ';

/**
 * Throws if the caller asked for an option that is not supported.
 *
 * @param {boolean|undefined} check truthy when the unsupported option was requested
 * @param {string} operation text placed directly after the "Failed to " prefix
 * @param {string} fieldName option name quoted in the message
 */
export const maybeThrowFailedToOption = (check, operation, fieldName) => {
  if (!check) {
    return;
  }
  const message = `${failedToString}${operation}: the '${fieldName}' option is unsupported.`;
  throw new Error(message);
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
|
||
// Truthy when a global Buffer constructor with a `from` method exists. The
// value is `false` or whatever `Buffer.from` is, not a strict boolean.
export const hasBufferFrom = typeof Buffer !== 'function' ? false : Buffer.from;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
|
||
/**
 * Decodes bytes by wrapping them in a UTF-8 text Blob and reading that Blob
 * back through a synchronous XMLHttpRequest. A hack for some old browsers.
 *
 * Any failure propagates as an exception, so callers should wrap this call
 * and provide a fallback.
 *
 * @param {Uint8Array} bytes data to decode
 * @return {string}
 */
export function decodeSyncXHR(bytes) {
  let objectUrl;

  // Sync XHR is disabled in non-Edgium Edge (and possibly elsewhere), so this
  // can throw. The finally clause only ensures the object URL is released.
  try {
    const blob = new Blob([bytes], { type: 'text/plain;charset=UTF-8' });
    objectUrl = URL.createObjectURL(blob);

    const request = new XMLHttpRequest();
    request.open('GET', objectUrl, false);
    request.send();
    return request.responseText;
  } finally {
    if (objectUrl) {
      URL.revokeObjectURL(objectUrl);
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.