Skip to content

Commit

Permalink
exported encode/decode utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
samthor committed Sep 1, 2017
1 parent 94bbb7c commit 093aae9
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# sample data
*.txt
101 changes: 101 additions & 0 deletions js-utf.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/**
 * Encodes a JavaScript (UTF-16) string as UTF-8, writing the bytes into
 * `target` starting at index `at`.
 *
 * Valid surrogate pairs are combined into a single code point and written as
 * a 4-byte sequence. Unpaired surrogates (a high surrogate without a
 * following low surrogate, or a low surrogate on its own) cannot be
 * represented in well-formed UTF-8 and are dropped.
 *
 * No bounds checking is performed on `target`; the caller must make sure it
 * is large enough (at most 3 bytes per UTF-16 code unit of input).
 *
 * @param {string} string
 * @param {!Array<number>|!TypedArray} target
 * @param {number=} at position to write into target
 * @return {number} the number of bytes written
 */
export function encode(string, target, at=0) {
  const start = at;
  const len = string.length;
  let pos = 0;

  while (pos < len) {
    let value = string.charCodeAt(pos++);

    if ((value & 0xf800) === 0xd800) {
      // Surrogate range 0xd800-0xdfff.
      if (value >= 0xdc00 || pos >= len) {
        continue;  // lone low surrogate, or high surrogate at end of input
      }
      const extra = string.charCodeAt(pos);
      if ((extra & 0xfc00) !== 0xdc00) {
        // Unpaired high surrogate: drop it. `pos` is not advanced, so the
        // following code unit is examined on its own in the next iteration.
        continue;
      }
      ++pos;
      value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
    }

    if (value < 0x80) {
      target[at++] = value;  // 1-byte (ASCII)
      continue;
    }

    // Write the lead byte and any middle continuation bytes; the final
    // continuation byte is shared by all multi-byte forms and written below.
    if (value < 0x800) {  // 2-byte
      target[at++] = ((value >> 6) & 0x1f) | 0xc0;
    } else if (value < 0x10000) {  // 3-byte
      target[at++] = ((value >> 12) & 0x0f) | 0xe0;
      target[at++] = ((value >> 6) & 0x3f) | 0x80;
    } else {  // 4-byte; value is at most 0x10ffff here
      target[at++] = ((value >> 18) & 0x07) | 0xf0;
      target[at++] = ((value >> 12) & 0x3f) | 0x80;
      target[at++] = ((value >> 6) & 0x3f) | 0x80;
    }

    target[at++] = (value & 0x3f) | 0x80;
  }

  return at - start;
}

/**
 * Decodes UTF-8 bytes into a JavaScript (UTF-16) string.
 *
 * Decoding stops at the first zero (or otherwise falsy) byte, i.e. the input
 * is treated as NUL-terminated. Bytes that cannot start a sequence (stray
 * continuation bytes, and lead bytes 0xf8 and above) are skipped. Continuation
 * bytes are not validated: only their low six bits are used.
 *
 * @param {!Array<number>|!TypedArray} bytes
 * @param {number=} pos index of the first byte to decode
 * @param {number=} len exclusive end index into `bytes` (not a byte count);
 *     a negative value means "up to bytes.length"
 * @return {string}
 */
export function decode(bytes, pos=0, len=-1) {
  const end = len < 0 ? bytes.length : Math.min(len, bytes.length);
  const out = [];

  while (pos < end) {
    const byte1 = bytes[pos++];
    if (!byte1) {
      break;  // NULL or null-like
    }

    if ((byte1 & 0x80) === 0) {  // 1-byte
      out.push(byte1);
    } else if ((byte1 & 0xe0) === 0xc0) {  // 2-byte
      const byte2 = bytes[pos++] & 0x3f;
      out.push(((byte1 & 0x1f) << 6) | byte2);
    } else if ((byte1 & 0xf0) === 0xe0) {  // 3-byte
      const byte2 = bytes[pos++] & 0x3f;
      const byte3 = bytes[pos++] & 0x3f;
      out.push(((byte1 & 0x0f) << 12) | (byte2 << 6) | byte3);
    } else if ((byte1 & 0xf8) === 0xf0) {  // 4-byte
      const byte2 = bytes[pos++] & 0x3f;
      const byte3 = bytes[pos++] & 0x3f;
      const byte4 = bytes[pos++] & 0x3f;

      // this can be > 0xffff, so possibly generate surrogates
      let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
      if (codepoint > 0xffff) {
        codepoint -= 0x10000;
        out.push((codepoint >>> 10) & 0x3ff | 0xd800);
        codepoint = 0xdc00 | codepoint & 0x3ff;
      }
      out.push(codepoint);
    } else {
      // FIXME: we're ignoring this
    }
  }

  // Spreading every code unit into one call overflows the engine's argument
  // limit on large inputs (RangeError), so build the string in bounded chunks.
  // Splitting a surrogate pair across chunks is harmless: concatenation puts
  // the two code units back next to each other.
  const chunkSize = 0x2000;
  if (out.length <= chunkSize) {
    return String.fromCharCode(...out);
  }
  let result = '';
  for (let i = 0; i < out.length; i += chunkSize) {
    result += String.fromCharCode(...out.slice(i, i + chunkSize));
  }
  return result;
}
10 changes: 10 additions & 0 deletions polyfill.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Stand-in for the 'encoding-indexes' table object: reading any property
// yields a fresh empty array.
const hack = new Proxy({}, {
  get: () => [],
});

// Minimal CommonJS-style globals: an empty `module.exports` container, and a
// `require` that ignores its argument and always hands back the stub above.
window.module = {exports: {}};
window.require = () => ({'encoding-indexes': hack});
101 changes: 101 additions & 0 deletions test.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<!DOCTYPE html>
<html>
<head>
<!--
  Benchmark page: times UTF-8 encode/decode of the same text with several
  implementations and logs the timings via console.time/console.timeEnd.
-->
<script defer src="https://cdn.rawgit.com/mathiasbynens/utf8.js/5566334e/utf8.js"></script>
<script type="module">
</script>
<script type="module">
// polyfill.js is imported before encoding.js, and the encoding.js exports are
// then read from `module.exports`.
// NOTE(review): presumably encoding.js is CommonJS-style and relies on the
// `module`/`require` globals installed by polyfill.js - confirm.
import './polyfill.js';
import 'https://cdn.rawgit.com/inexorabletash/text-encoding/b98ab30b/lib/encoding.js';
const polyfill = module.exports;

import * as utf from './js-utf.js';

// Number of iterations inside each timed region.
const runs = 100;
const dataUrl = './utf8_sequence_0-0xffff_assigned_printable.txt';

(async function() {
// Load the sample data and keep only its first 10000 UTF-16 code units.
let text = await window.fetch(dataUrl).then((data) => data.text());
text = text.substr(0, 10000);
// `saved` holds the most recent encode result, `output` the most recent decode result.
let saved, output;

// NOTE(review): `utf8` is presumably the global defined by the deferred
// utf8.js script tag in <head> - confirm.
console.time('utf8.encode');
for (let i = 0; i < runs; ++i) {
const s = utf8.encode(text);
saved = s;
}
console.timeEnd('utf8.encode');

console.time('utf8.decode');
for (let i = 0; i < runs; ++i) {
const out = utf8.decode(saved);
output = out;
}
console.timeEnd('utf8.decode');
// Round-trip check for utf8.js.
if (output !== text) {
throw new Error('utf8 got wrong answer');
}

// js-utf encodes into a preallocated 1 MiB buffer; the written prefix is
// copied out with slice() on every iteration (inside the timed region).
console.time('js-utf.encode');
const out = new Uint8Array(1024 * 1024);
for (let i = 0; i < runs; ++i) {
const len = utf.encode(text, out);
saved = out.slice(0, len);
}
console.timeEnd('js-utf.encode');
console.info(saved);
// Keep the js-utf encoded bytes: they are the input for both TextDecoder
// benchmarks below.
const polyfillSaved = saved;

console.time('js-utf.decode');
for (let i = 0; i < runs; ++i) {
const out = utf.decode(saved);
output = out;
}
console.timeEnd('js-utf.decode');
// Round-trip check for js-utf.
if (output !== text) {
console.info('expected output length', text.length, 'was', output.length);
throw new Error('js-utf got wrong answer');
}


// Global TextEncoder/TextDecoder. The results of this section and the
// polyfill section below are timed and logged but not compared against `text`.
console.time('TextEncoder');
const encoder = new TextEncoder();
for (let i = 0; i < runs; ++i) {
const out = encoder.encode(text);
saved = out;
}
console.timeEnd('TextEncoder');
console.info(saved);

console.time('TextDecoder');
const decoder = new TextDecoder();
for (let i = 0; i < runs; ++i) {
const out = decoder.decode(polyfillSaved);
output = out;
}
console.timeEnd('TextDecoder');


// TextEncoder/TextDecoder taken from `module.exports` (see imports above).
console.time('polyfill.TextEncoder');
const encoderP = new polyfill.TextEncoder();
for (let i = 0; i < runs; ++i) {
const out = encoderP.encode(text);
saved = out;
}
console.timeEnd('polyfill.TextEncoder');
console.info(saved);

console.time('polyfill.TextDecoder');
const decoderP = new polyfill.TextDecoder();
for (let i = 0; i < runs; ++i) {
const out = decoderP.decode(polyfillSaved);
output = out;
}
console.timeEnd('polyfill.TextDecoder');


}());

</script>
</head>
</html>

0 comments on commit 093aae9

Please sign in to comment.