forked from samthor/fast-text-encoding
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 4 changed files with 214 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# sample data | ||
*.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
/**
 * Encodes a JavaScript (UTF-16) string as UTF-8, writing the bytes into
 * `target` starting at index `at`.
 *
 * A valid surrogate pair is combined into one code point and written as a
 * 4-byte sequence. A lone surrogate (high or low) has no well-formed UTF-8
 * representation and is dropped from the output.
 *
 * No bounds checking is done on `target`: a plain Array grows as needed, while
 * writes past the end of a fixed-size TypedArray are discarded by the runtime
 * even though they are still counted in the return value.
 *
 * @param {string} string
 * @param {!Array<number>|!TypedArray} target
 * @param {number=} at position to write into target
 * @return {number} the number of bytes written
 */
export function encode(string, target, at=0) {
  const start = at;
  const len = string.length;
  let pos = 0;

  while (pos < len) {
    let value = string.charCodeAt(pos++);

    if ((value & 0xf800) === 0xd800) {
      // Surrogate code unit (0xd800-0xdfff): only a high surrogate directly
      // followed by a low surrogate forms a code point.
      if (value > 0xdbff) {
        continue;  // drop lone low surrogate
      }
      const extra = pos < len ? string.charCodeAt(pos) : 0;
      if ((extra & 0xfc00) !== 0xdc00) {
        // Drop the lone high surrogate; `extra` is not consumed, so it is
        // processed on its own in the next iteration.
        continue;
      }
      ++pos;
      value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
    }

    if (value < 0x80) {  // 1-byte
      target[at++] = value;  // ASCII
      continue;
    }

    // Write the lead byte plus all but the last continuation byte; the final
    // continuation byte is shared by every multi-byte form and written below.
    if (value < 0x800) {  // 2-byte
      target[at++] = ((value >> 6) & 0x1f) | 0xc0;
    } else if (value < 0x10000) {  // 3-byte
      target[at++] = ((value >> 12) & 0x0f) | 0xe0;
      target[at++] = ((value >> 6) & 0x3f) | 0x80;
    } else {  // 4-byte (value is at most 0x10ffff here)
      target[at++] = ((value >> 18) & 0x07) | 0xf0;
      target[at++] = ((value >> 12) & 0x3f) | 0x80;
      target[at++] = ((value >> 6) & 0x3f) | 0x80;
    }

    target[at++] = (value & 0x3f) | 0x80;
  }

  return at - start;
}
|
||
/**
 * Decodes UTF-8 bytes into a JavaScript string.
 *
 * Decoding stops early at the first lead byte that is zero or otherwise falsy
 * (for example a missing array slot). Bytes that cannot start a sequence
 * (stray continuation bytes, 0xf8 and above) are skipped. Continuation bytes
 * are not validated, and a truncated trailing sequence is decoded as if the
 * missing bytes were zero.
 *
 * @param {!Array<number>|!TypedArray} bytes
 * @param {number=} pos index of the first byte to decode
 * @param {number=} len index at which decoding stops (exclusive), clamped to
 *     bytes.length; a negative value means bytes.length
 * @return {string}
 */
export function decode(bytes, pos=0, len=-1) {
  const end = len < 0 ? bytes.length : Math.min(len, bytes.length);

  // UTF-16 code units are converted in bounded batches: passing an arbitrarily
  // large array as call arguments to String.fromCharCode throws a RangeError.
  const chunkSize = 0x2000;
  const out = [];
  let result = '';
  let at = pos;

  while (at < end) {
    const byte1 = bytes[at++];
    if (!byte1) {
      break;  // NULL or null-like
    }

    if ((byte1 & 0x80) === 0) {  // 1-byte
      out.push(byte1);
    } else if ((byte1 & 0xe0) === 0xc0) {  // 2-byte
      const byte2 = bytes[at++] & 0x3f;
      out.push(((byte1 & 0x1f) << 6) | byte2);
    } else if ((byte1 & 0xf0) === 0xe0) {  // 3-byte
      const byte2 = bytes[at++] & 0x3f;
      const byte3 = bytes[at++] & 0x3f;
      out.push(((byte1 & 0x0f) << 12) | (byte2 << 6) | byte3);
    } else if ((byte1 & 0xf8) === 0xf0) {  // 4-byte
      const byte2 = bytes[at++] & 0x3f;
      const byte3 = bytes[at++] & 0x3f;
      const byte4 = bytes[at++] & 0x3f;

      // this can be > 0xffff, so possibly generate surrogates
      let codepoint = ((byte1 & 0x07) << 18) | (byte2 << 12) | (byte3 << 6) | byte4;
      if (codepoint > 0xffff) {
        codepoint -= 0x10000;
        out.push(((codepoint >>> 10) & 0x3ff) | 0xd800);
        codepoint = 0xdc00 | (codepoint & 0x3ff);
      }
      out.push(codepoint);
    }
    // Any other lead byte is invalid and intentionally skipped.

    if (out.length >= chunkSize) {
      result += String.fromCharCode.apply(null, out);
      out.length = 0;
    }
  }

  return result + String.fromCharCode.apply(null, out);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
// CommonJS-style shim for the browser. It exposes `module.exports` and a
// `require` stub on `window` so that a script written for CommonJS can be
// loaded with a plain import and its exports read back from
// `window.module.exports`.

// Stand-in for the encoding index tables: every property lookup yields a fresh
// empty array. NOTE(review): presumably this lets the consumer run without
// shipping the real index data - confirm against the library being loaded.
const emptyIndexes = new Proxy({}, {get: () => []});

window.module = {exports: {}};

// Whatever module is requested, hand back an object whose 'encoding-indexes'
// entry is the stand-in above.
window.require = () => {
  return {'encoding-indexes': emptyIndexes};
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
<!DOCTYPE html>
<html>
<head>
<!--
  Micro-benchmark page: times UTF-8 encode/decode of the same text using
  utf8.js, the local js-utf module, the browser's native
  TextEncoder/TextDecoder, and the text-encoding polyfill. Results are
  reported through console.time / console.timeEnd.
-->
<script defer src="https://cdn.rawgit.com/mathiasbynens/utf8.js/5566334e/utf8.js"></script>
<script type="module">
// polyfill.js must be imported first: it installs the window.module /
// window.require shim that encoding.js is loaded through.
import './polyfill.js';
import 'https://cdn.rawgit.com/inexorabletash/text-encoding/b98ab30b/lib/encoding.js';
import * as utf from './js-utf.js';

const polyfill = module.exports;

const runs = 100;
const dataUrl = './utf8_sequence_0-0xffff_assigned_printable.txt';

(async function() {
  const response = await window.fetch(dataUrl);
  if (!response.ok) {
    // Without this check an HTTP error page would be benchmarked silently.
    throw new Error(`failed to load ${dataUrl}: HTTP ${response.status}`);
  }
  const text = (await response.text()).slice(0, 10000);
  let saved;
  let output;

  console.time('utf8.encode');
  for (let i = 0; i < runs; ++i) {
    saved = utf8.encode(text);
  }
  console.timeEnd('utf8.encode');

  console.time('utf8.decode');
  for (let i = 0; i < runs; ++i) {
    output = utf8.decode(saved);
  }
  console.timeEnd('utf8.decode');
  if (output !== text) {
    throw new Error('utf8 got wrong answer');
  }

  // Allocate the scratch buffer outside the timed region.
  const buffer = new Uint8Array(1024 * 1024);
  console.time('js-utf.encode');
  for (let i = 0; i < runs; ++i) {
    const len = utf.encode(text, buffer);
    saved = buffer.slice(0, len);
  }
  console.timeEnd('js-utf.encode');
  console.info(saved);
  // Bytes produced by js-utf; reused as input for every decoder below.
  const jsUtfBytes = saved;

  console.time('js-utf.decode');
  for (let i = 0; i < runs; ++i) {
    output = utf.decode(saved);
  }
  console.timeEnd('js-utf.decode');
  if (output !== text) {
    console.info('expected output length', text.length, 'was', output.length);
    throw new Error('js-utf got wrong answer');
  }

  console.time('TextEncoder');
  const encoder = new TextEncoder();
  for (let i = 0; i < runs; ++i) {
    saved = encoder.encode(text);
  }
  console.timeEnd('TextEncoder');
  console.info(saved);

  console.time('TextDecoder');
  const decoder = new TextDecoder();
  for (let i = 0; i < runs; ++i) {
    output = decoder.decode(jsUtfBytes);
  }
  console.timeEnd('TextDecoder');
  if (output !== text) {
    throw new Error('TextDecoder got wrong answer');
  }

  console.time('polyfill.TextEncoder');
  const encoderP = new polyfill.TextEncoder();
  for (let i = 0; i < runs; ++i) {
    saved = encoderP.encode(text);
  }
  console.timeEnd('polyfill.TextEncoder');
  console.info(saved);

  console.time('polyfill.TextDecoder');
  const decoderP = new polyfill.TextDecoder();
  for (let i = 0; i < runs; ++i) {
    output = decoderP.decode(jsUtfBytes);
  }
  console.timeEnd('polyfill.TextDecoder');
  if (output !== text) {
    throw new Error('polyfill.TextDecoder got wrong answer');
  }
}());
</script>
</head>
</html>