exported encode/decode utf8

tetherto · Sep 1, 2017 · 093aae9 · 093aae9
1 parent 94bbb7c
commit 093aae9
Show file tree

Hide file tree

Showing 4 changed files with 214 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+# sample data
+*.txt
diff --git a/js-utf.js b/js-utf.js
@@ -0,0 +1,101 @@
+/**
+ * Encodes a JavaScript (UTF-16) string as UTF-8, writing the bytes into
+ * `target` starting at index `at`.
+ *
+ * A high surrogate immediately followed by a low surrogate is combined into
+ * one code point and written as a 4-byte sequence. Unpaired surrogates, high
+ * or low, are dropped so that the output never contains an encoded surrogate
+ * (which would not be valid UTF-8).
+ *
+ * No bounds checking is done on `target`: the returned count is the number of
+ * writes attempted, so the caller must supply a target that is large enough.
+ *
+ * @param {string} string
+ * @param {!Array<number>|!TypedArray} target
+ * @param {number=} at position to write into target
+ * @return {number} the number of bytes written
+ */
+export function encode(string, target, at=0) {
+  const start = at;
+  const len = string.length;
+  let pos = 0;
+
+  while (pos < len) {
+    let value = string.charCodeAt(pos++);
+
+    if ((value & 0xf800) === 0xd800) {
+      // Surrogate code unit (0xd800-0xdfff). Only "high then low" is valid.
+      if (value > 0xdbff || pos >= len) {
+        continue;  // drop lone low surrogate, or high surrogate at end of input
+      }
+      const extra = string.charCodeAt(pos);
+      if ((extra & 0xfc00) !== 0xdc00) {
+        // Drop the lone high surrogate. `extra` is not consumed here, so it is
+        // handled on its own by the next iteration.
+        continue;
+      }
+      ++pos;
+      value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
+    }
+
+    if (value < 0x80) {  // 1-byte
+      target[at++] = value;  // ASCII
+      continue;
+    }
+
+    // Write the lead byte plus any middle continuation bytes; the final
+    // continuation byte is shared by all multi-byte forms and written below.
+    if (value < 0x800) {  // 2-byte
+      target[at++] = ((value >>  6) & 0x1f) | 0xc0;
+    } else if (value < 0x10000) {  // 3-byte
+      target[at++] = ((value >> 12) & 0x0f) | 0xe0;
+      target[at++] = ((value >>  6) & 0x3f) | 0x80;
+    } else {  // 4-byte; value is at most 0x10ffff after surrogate pairing
+      target[at++] = ((value >> 18) & 0x07) | 0xf0;
+      target[at++] = ((value >> 12) & 0x3f) | 0x80;
+      target[at++] = ((value >>  6) & 0x3f) | 0x80;
+    }
+
+    target[at++] = (value & 0x3f) | 0x80;
+  }
+
+  return at - start;
+}
+
+/**
+ * Decodes UTF-8 bytes into a JavaScript string.
+ *
+ * Decoding starts at index `pos` and stops at index `len` (exclusive, clamped
+ * to `bytes.length`), or earlier if a zero/falsy lead byte is read, which is
+ * treated as a terminator.
+ *
+ * Input is not validated: continuation bytes are masked without checking
+ * their high bits, bytes missing past the end of the input decode as zero
+ * bits, and a byte that is not a valid lead byte is skipped.
+ *
+ * @param {!Array<number>|!TypedArray} bytes
+ * @param {number=} pos index of the first byte to decode
+ * @param {number=} len index to stop at (exclusive); negative means the end
+ *     of `bytes`
+ * @return {string}
+ */
+export function decode(bytes, pos=0, len=-1) {
+  // Maximum number of code units handed to one String.fromCharCode call.
+  // Spreading the whole output at once exceeds the engine's argument limit
+  // (RangeError) on large inputs.
+  const chunkSize = 0x2000;
+  const end = len < 0 ? bytes.length : Math.min(len, bytes.length);
+  const out = [];
+
+  while (pos < end) {
+    const byte1 = bytes[pos++];
+    if (!byte1) {
+      break;  // NULL or null-like
+    }
+
+    if ((byte1 & 0x80) === 0) {  // 1-byte
+      out.push(byte1);
+    } else if ((byte1 & 0xe0) === 0xc0) {  // 2-byte
+      const byte2 = bytes[pos++] & 0x3f;
+      out.push(((byte1 & 0x1f) << 6) | byte2);
+    } else if ((byte1 & 0xf0) === 0xe0) {  // 3-byte
+      const byte2 = bytes[pos++] & 0x3f;
+      const byte3 = bytes[pos++] & 0x3f;
+      out.push(((byte1 & 0x0f) << 12) | (byte2 << 6) | byte3);
+    } else if ((byte1 & 0xf8) === 0xf0) {  // 4-byte
+      const byte2 = bytes[pos++] & 0x3f;
+      const byte3 = bytes[pos++] & 0x3f;
+      const byte4 = bytes[pos++] & 0x3f;
+
+      // this can be > 0xffff, so possibly generate surrogates
+      let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
+      if (codepoint > 0xffff) {
+        codepoint -= 0x10000;
+        out.push((codepoint >>> 10) & 0x3ff | 0xd800);
+        codepoint = 0xdc00 | codepoint & 0x3ff;
+      }
+      out.push(codepoint);
+    } else {
+      // FIXME: we're ignoring this (stray continuation or invalid lead byte)
+    }
+  }
+
+  // Build the string chunk by chunk. Splitting between the two halves of a
+  // surrogate pair is harmless because concatenation rejoins the code units.
+  let result = '';
+  for (let i = 0; i < out.length; i += chunkSize) {
+    result += String.fromCharCode(...out.slice(i, i + chunkSize));
+  }
+  return result;
+}
diff --git a/polyfill.js b/polyfill.js
@@ -0,0 +1,10 @@
+// Fakes just enough of a CommonJS environment on `window` for a script that
+// expects `module.exports` and `require` to exist.
+
+// Any property read on this proxy yields a fresh empty array.
+const hack = new Proxy({}, {
+  get: () => [],
+});
+
+window.module = {exports: {}};
+
+// Whatever is required, the caller gets an object whose only key,
+// 'encoding-indexes', is the proxy above.
+window.require = () => {
+  return {'encoding-indexes': hack};
+};
diff --git a/test.html b/test.html
@@ -0,0 +1,101 @@
+<!DOCTYPE html>
+<html>
+<head>
+<!-- Ad-hoc benchmark page: times several UTF-8 encode/decode implementations
+     and reports the timings through console.time/console.timeEnd. -->
+<script defer src="https://cdn.rawgit.com/mathiasbynens/utf8.js/5566334e/utf8.js"></script>
+<script type="module">
+</script>
+<script type="module">
+  // polyfill.js sets window.module and window.require.
+  // NOTE(review): presumably encoding.js relies on those and stores its
+  // TextEncoder/TextDecoder on module.exports - confirm.
+  import './polyfill.js';
+  import 'https://cdn.rawgit.com/inexorabletash/text-encoding/b98ab30b/lib/encoding.js';
+  const polyfill = module.exports;
+
+  import * as utf from './js-utf.js';
+
+  // Number of iterations timed for each benchmark.
+  const runs = 100;
+  // Sample data file; the commit's .gitignore excludes *.txt, so it is not
+  // part of the repository.
+  const dataUrl = './utf8_sequence_0-0xffff_assigned_printable.txt';
+
+  (async function() {
+    let text = await window.fetch(dataUrl).then((data) => data.text());
+    // Only the first 10000 UTF-16 code units are benchmarked.
+    text = text.substr(0, 10000);
+    // `saved` holds the latest encoder output, `output` the latest decoder output.
+    let saved, output;
+
+    // NOTE(review): `utf8` is presumably the global defined by the utf8.js
+    // script loaded above - confirm.
+    console.time('utf8.encode');
+    for (let i = 0; i < runs; ++i) {
+      const s = utf8.encode(text);
+      saved = s;
+    }
+    console.timeEnd('utf8.encode');
+
+    console.time('utf8.decode');
+    for (let i = 0; i < runs; ++i) {
+      const out = utf8.decode(saved);
+      output = out;
+    }
+    console.timeEnd('utf8.decode');
+    // Round-trip check for utf8.js.
+    if (output !== text) {
+      throw new Error('utf8 got wrong answer');
+    }
+
+    // js-utf encodes into a preallocated 1 MiB buffer; slice() copies out the
+    // bytes that were written.
+    console.time('js-utf.encode');
+    const out = new Uint8Array(1024 * 1024);
+    for (let i = 0; i < runs; ++i) {
+      const len = utf.encode(text, out);
+      saved = out.slice(0, len);
+    }
+    console.timeEnd('js-utf.encode');
+    console.info(saved);
+    // Despite the name, these are the bytes produced by js-utf; they are the
+    // input for both TextDecoder benchmarks below.
+    const polyfillSaved = saved;
+
+    console.time('js-utf.decode');
+    for (let i = 0; i < runs; ++i) {
+      const out = utf.decode(saved);
+      output = out;
+    }
+    console.timeEnd('js-utf.decode');
+    // Round-trip check for js-utf.
+    if (output !== text) {
+      console.info('expected output length', text.length, 'was', output.length);
+      throw new Error('js-utf got wrong answer');
+    }
+
+
+    // The global TextEncoder/TextDecoder; their results are timed but not
+    // compared against `text`.
+    console.time('TextEncoder');
+    const encoder = new TextEncoder();
+    for (let i = 0; i < runs; ++i) {
+      const out = encoder.encode(text);
+      saved = out;
+    }
+    console.timeEnd('TextEncoder');
+    console.info(saved);
+
+    console.time('TextDecoder');
+    const decoder = new TextDecoder();
+    for (let i = 0; i < runs; ++i) {
+      const out = decoder.decode(polyfillSaved);
+      output = out;
+    }
+    console.timeEnd('TextDecoder');
+
+
+    // TextEncoder/TextDecoder taken from module.exports (see `polyfill`
+    // above); again timed only, not verified.
+    console.time('polyfill.TextEncoder');
+    const encoderP = new polyfill.TextEncoder();
+    for (let i = 0; i < runs; ++i) {
+      const out = encoderP.encode(text);
+      saved = out;
+    }
+    console.timeEnd('polyfill.TextEncoder');
+    console.info(saved);
+
+    console.time('polyfill.TextDecoder');
+    const decoderP = new polyfill.TextDecoder();
+    for (let i = 0; i < runs; ++i) {
+      const out = decoderP.decode(polyfillSaved);
+      output = out;
+    }
+    console.timeEnd('polyfill.TextDecoder');
+
+
+  }());
+
+</script>
+</head>
+</html>