Test varcharset

slevithan · Oct 28, 2024 · 7bc4aa5 · 7bc4aa5
1 parent c2c13f3
commit 7bc4aa5
Show file tree

Hide file tree

Showing 9 changed files with 94 additions and 30 deletions.
diff --git a/demo/index.html b/demo/index.html
@@ -59,7 +59,14 @@ <h2>Try it</h2>
       </p>
     </details>
     <pre id="output"></pre>
-    <p><small>The output shows the result of calling <code>toRegExp</code>. Oniguruma-To-ES includes functions to generate additional formats: <code>compile</code>, <code>toOnigurumaAst</code>, and <code>toRegexAst</code> (for an AST based on <a href="https://github.com/slevithan/regex"><code>regex</code></a>). You can run all of these from the console on this page. <code>compile</code> and <code>toRegExp</code> accept a <code>pattern</code> string, optional <code>flags</code> string, and optional <code>options</code> object. <code>toOnigurumaAst</code> and <code>toRegexAst</code> accept a <code>pattern</code> and optional <code>flags</code>. You can also pass AST results to <code>printAst</code>.</small></p>
+    <p>The output shows the result of calling <code>toRegExp</code>. Oniguruma-To-ES includes functions to generate additional formats: <code>compile</code>, <code>toOnigurumaAst</code>, and <code>toRegexAst</code> (for an AST based on <a href="https://github.com/slevithan/regex"><code>regex</code></a>). You can run all of these from the console on this page.</p>
+    <details>
+      <summary>More details</summary>
+      <ul>
+        <li><code>compile</code> and <code>toRegExp</code> accept <code>pattern: string, flags?: string, options?: object</code>.</li>
+        <li><code>toOnigurumaAst</code> and <code>toRegexAst</code> accept <code>pattern: string, flags?: string</code>.</li>
+        <li>You can pretty-print AST results by passing them to <code>printAst</code> in the console on this page.</li>
+      </ul>
+    </details>
   </main>
 
   <script src="../dist/index.min.js"></script>

diff --git a/dist/index.min.js b/dist/index.min.js
diff --git a/spec/match-assertion.spec.js b/spec/match-assertion.spec.js
@@ -7,6 +7,8 @@ beforeEach(() => {
 });
 
 describe('Assertion', () => {
+  // For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js`
+
   describe('line_end', () => {
     it('should match at the end of the string', () => {
       expect('ba').toMatchWithAllTargets('a$');
@@ -70,8 +72,6 @@ describe('Assertion', () => {
     });
   });
 
-  // For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js`
-
   describe('search_start', () => {
     it('should match at the start of the search', () => {
       expect('a').toMatchWithAllTargets(r`\Ga`);

diff --git a/spec/match-character-class.spec.js → spec/match-char-class.spec.js b/spec/match-character-class.spec.js → spec/match-char-class.spec.js
diff --git a/spec/match-character-set.spec.js → spec/match-char-set.spec.js b/spec/match-character-set.spec.js → spec/match-char-set.spec.js
diff --git a/spec/match-character.spec.js → spec/match-char.spec.js b/spec/match-character.spec.js → spec/match-char.spec.js
diff --git a/spec/match-varchar-set.spec.js b/spec/match-varchar-set.spec.js
@@ -0,0 +1,51 @@
+import {r} from '../src/utils.js';
+import {matchers} from './helpers/matchers.js';
+
+beforeEach(() => {
+  jasmine.addMatchers(matchers);
+});
+
+describe('VariableLengthCharacterSet', () => {
+  describe('grapheme', () => {
+    const graphemes = [
+      '\0',
+      '\r\n',
+      '\xE9', // é
+      '\x65\u0301', // é
+      '\u{2194}\u{FE0F}', // ↔️
+      '\u{1F469}\u{1F3FF}', // 👩🏿
+    ];
+
+    it('should match any Unicode grapheme', () => {
+      for (const grapheme of graphemes) {
+        expect(grapheme).toMatchWithAllTargets(r`\A\X\z`);
+      }
+    });
+
+    it(r`should match graphemes atomically`, () => {
+      for (const grapheme of graphemes) {
+        expect(grapheme).not.toMatchWithAllTargets(r`\A\X${grapheme.at(-1)}\z`);
+      }
+    });
+  });
+
+  describe('newline', () => {
+    it('should match any line break from the accepted newline set', () => {
+      const newlines = ['\r\n', '\r', '\n', '\v', '\f', '\x85', '\u2028', '\u2029'];
+      for (const newline of newlines) {
+        expect(newline).toMatchWithAllTargets(r`\A\R\z`);
+      }
+    });
+
+    it('should not match chars outside the accepted newline set', () => {
+      const nonNewlines = ['\n\r', ' ', 't'];
+      for (const non of nonNewlines) {
+        expect(non).not.toMatchWithAllTargets(r`\A\R\z`);
+      }
+    });
+
+    it(r`should match \r\n atomically`, () => {
+      expect('\r\n').not.toMatchWithAllTargets(r`\A\R\n\z`);
+    });
+  });
+});
diff --git a/src/generate.js b/src/generate.js
@@ -192,6 +192,8 @@ const CharCodeEscapeMap = new Map([
   [11, r`\v`], // vertical tab
   [12, r`\f`], // form feed
   [13, r`\r`], // carriage return
+  [0x2028, r`\u2028`], // line separator
+  [0x2029, r`\u2029`], // paragraph separator
 ]);
 
 const casedRe = /^\p{Cased}$/u;
@@ -250,7 +252,7 @@ function genCapturingGroup({name, number, alternatives}, state, gen) {
 
 function genCharacter({value}, state) {
   const char = cp(value);
-  const escaped = getEscapedChar(value, {
+  const escaped = getCharEscape(value, {
     isAfterBackref: state.lastNode.type === AstTypes.Backreference,
     inCharClass: state.inCharClass,
     useFlagV: state.useFlagV,
@@ -302,17 +304,17 @@ function genCharacterClassRange(node, state) {
     inCharClass: true,
     useFlagV: state.useFlagV,
   };
-  const minStr = getEscapedChar(min, escOpts);
-  const maxStr = getEscapedChar(max, escOpts);
+  const minStr = getCharEscape(min, escOpts);
+  const maxStr = getCharEscape(max, escOpts);
   let extraChars = '';
   if (state.useAppliedIgnoreCase && state.currentFlags.ignoreCase) {
     // [TODO] Avoid duplication by considering other chars in the parent char class when expanding
     const charsOutsideRange = getCasesOutsideCharClassRange(node);
     const ranges = getCodePointRangesFromChars(charsOutsideRange);
     ranges.forEach(value => {
       extraChars += Array.isArray(value) ?
-        `${getEscapedChar(value[0], escOpts)}-${getEscapedChar(value[1], escOpts)}` :
-        getEscapedChar(value, escOpts);
+        `${getCharEscape(value[0], escOpts)}-${getCharEscape(value[1], escOpts)}` :
+        getCharEscape(value, escOpts);
     });
   }
   // Create the range without calling `gen` on the `min`/`max` kids
@@ -429,36 +431,23 @@ function getCasesOutsideCharClassRange(node, {firstOnly} = {}) {
   return found;
 }
 
-function getCodePointRangesFromChars(chars) {
-  const codePoints = chars.map(char => char.codePointAt(0)).sort((a, b) => a - b);
-  const values = [];
-  let start = null;
-  for (let i = 0; i < codePoints.length; i++) {
-    if (codePoints[i + 1] === codePoints[i] + 1) {
-      start ??= codePoints[i];
-    } else if (start === null) {
-      values.push(codePoints[i]);
-    } else {
-      values.push([start, codePoints[i]]);
-      start = null;
-    }
-  }
-  return values;
-}
-
 // This shouldn't modify any char that has case
-function getEscapedChar(codePoint, {isAfterBackref, inCharClass, useFlagV}) {
+function getCharEscape(codePoint, {isAfterBackref, inCharClass, useFlagV}) {
   if (CharCodeEscapeMap.has(codePoint)) {
     return CharCodeEscapeMap.get(codePoint);
   }
   if (
     // Control chars, etc.; condition modeled on the Chrome developer console's display for strings
     codePoint < 32 || (codePoint > 126 && codePoint < 160) ||
+    // Unicode planes 4-16; unassigned, special purpose, and private use area
+    codePoint > 0x3FFFF ||
     // Avoid corrupting a preceding backref by immediately following it with a literal digit
     (isAfterBackref && isDigitCharCode(codePoint))
   ) {
     // Don't convert codePoint `0` to `\0` since that's corruptible by following literal digits
-    return r`\x${codePoint.toString(16).padStart(2, '0')}`;
+    return codePoint > 0xFF ?
+      r`\u{${codePoint.toString(16).toUpperCase()}}` :
+      r`\x${codePoint.toString(16).toUpperCase().padStart(2, '0')}`;
   }
   const escapeChars = inCharClass ?
     (useFlagV ? CharClassEscapeCharsFlagV : CharClassEscapeChars) :
@@ -467,6 +456,23 @@ function getEscapedChar(codePoint, {isAfterBackref, inCharClass, useFlagV}) {
   return (escapeChars.has(char) ? '\\' : '') + char;
 }
 
+// Converts a list of strings to a list of code point values and ranges, sorted ascending. Only the
+// first code point of each string is used. Runs of consecutive code points are collapsed into
+// `[start, end]` arrays; code points that aren't part of a run are returned as plain numbers.
+// Repeated code points aren't deduplicated.
+function getCodePointRangesFromChars(chars) {
+  const codePoints = chars.map(char => char.codePointAt(0)).sort((a, b) => a - b);
+  const values = [];
+  // First code point of the run currently being collected, or `null` when not within a run
+  let start = null;
+  for (let i = 0; i < codePoints.length; i++) {
+    if (codePoints[i + 1] === codePoints[i] + 1) {
+      // The next code point is adjacent, so begin or continue a run
+      start ??= codePoints[i];
+    } else if (start === null) {
+      values.push(codePoints[i]);
+    } else {
+      // The current code point ends the run
+      values.push([start, codePoints[i]]);
+      start = null;
+    }
+  }
+  return values;
+}
+
 function getGroupPrefix(atomic, flagMods, useFlagMods) {
   if (atomic) {
     return '>';

diff --git a/src/parse.js b/src/parse.js
@@ -57,8 +57,8 @@ const AstCharacterSetKinds = TokenCharacterSetKinds;
 const AstDirectiveKinds = TokenDirectiveKinds;
 
 const AstVariableLengthCharacterSetKinds = {
-  newline: 'newline',
   grapheme: 'grapheme',
+  newline: 'newline',
 };
 
 /**