Skip to content

Commit

Permalink
Test varcharset
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Oct 28, 2024
1 parent c2c13f3 commit 7bc4aa5
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 30 deletions.
9 changes: 8 additions & 1 deletion demo/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,14 @@ <h2>Try it</h2>
</p>
</details>
<pre id="output"></pre>
<p><small>The output shows the result of calling <code>toRegExp</code>. Oniguruma-To-ES includes functions to generate additional formats: <code>compile</code>, <code>toOnigurumaAst</code>, and <code>toRegexAst</code> (for an AST based on <a href="https://github.com/slevithan/regex"><code>regex</code></a>). You can run all of these from the console on this page. <code>compile</code> and <code>toRegExp</code> accept a <code>pattern</code> string, optional <code>flags</code> string, and optional <code>options</code> object. <code>toOnigurumaAst</code> and <code>toRegexAst</code> accept a <code>pattern</code> and optional <code>flags</code>. You can also pass AST results to <code>printAst</code>.</small></p>
<p>The output shows the result of calling <code>toRegExp</code>. Oniguruma-To-ES includes functions to generate additional formats: <code>compile</code>, <code>toOnigurumaAst</code>, and <code>toRegexAst</code> (for an AST based on <a href="https://github.com/slevithan/regex"><code>regex</code></a>). You can run all of these from the console on this page.</p>
<details>
<summary>More details</summary>
<ul>
<li><code>compile</code> and <code>toRegExp</code> accept <code>pattern: string, flags?: string, options?: object</code>.</li>
<li><code>toOnigurumaAst</code> and <code>toRegexAst</code> accept <code>pattern: string, flags?: string</code>.</li>
<li>You can pretty-print AST results by passing them to <code>printAst</code> in the console on this page.</li>
</ul>
</details>
</main>

<script src="../dist/index.min.js"></script>
Expand Down
4 changes: 2 additions & 2 deletions dist/index.min.js

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions spec/match-assertion.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ beforeEach(() => {
});

describe('Assertion', () => {
// For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js`

describe('line_end', () => {
it('should match at the end of the string', () => {
expect('ba').toMatchWithAllTargets('a$');
Expand Down Expand Up @@ -70,8 +72,6 @@ describe('Assertion', () => {
});
});

// For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js`

describe('search_start', () => {
it('should match at the start of the search', () => {
expect('a').toMatchWithAllTargets(r`\Ga`);
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
51 changes: 51 additions & 0 deletions spec/match-varchar-set.spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import {r} from '../src/utils.js';
import {matchers} from './helpers/matchers.js';

beforeEach(() => {
  // Register the imported custom matchers before each spec; the specs below rely on
  // `toMatchWithAllTargets` (presumably supplied by `matchers` -- confirm in `./helpers/matchers.js`)
  jasmine.addMatchers(matchers);
});

describe('VariableLengthCharacterSet', () => {
  // `\X`
  describe('grapheme', () => {
    // Each entry is expected to be matched by a single `\X`
    const graphemes = [
      '\0',
      '\r\n',
      '\xE9', // é
      '\x65\u0301', // é
      '\u{2194}\u{FE0F}', // ↔️
      '\u{1F469}\u{1F3FF}', // 👩🏿
    ];

    it('should match any Unicode grapheme', () => {
      for (const grapheme of graphemes) {
        expect(grapheme).toMatchWithAllTargets(r`\A\X\z`);
      }
    });

    it('should match graphemes atomically', () => {
      for (const grapheme of graphemes) {
        // Iterate by code point rather than by UTF-16 code unit; `grapheme.at(-1)` would inject a
        // lone low surrogate into the pattern for graphemes that end with an astral code point
        const lastCodePoint = [...grapheme].at(-1);
        // If `\X` could backtrack and give up its final code point, this pattern would match
        expect(grapheme).not.toMatchWithAllTargets(r`\A\X${lastCodePoint}\z`);
      }
    });
  });

  // `\R`
  describe('newline', () => {
    it('should match any line break from the accepted newline set', () => {
      const newlines = ['\r\n', '\r', '\n', '\v', '\f', '\x85', '\u2028', '\u2029'];
      for (const newline of newlines) {
        expect(newline).toMatchWithAllTargets(r`\A\R\z`);
      }
    });

    it('should not match chars outside the accepted newline set', () => {
      // `\n\r` is two separate line breaks (only `\r\n` is a pair), so a single `\R` can't span it
      const nonNewlines = ['\n\r', ' ', 't'];
      for (const non of nonNewlines) {
        expect(non).not.toMatchWithAllTargets(r`\A\R\z`);
      }
    });

    // The `r` tag keeps `\r\n` as literal backslash sequences in the spec title
    it(r`should match \r\n atomically`, () => {
      // If `\R` could backtrack to match only `\r`, the trailing `\n` in the pattern would match
      expect('\r\n').not.toMatchWithAllTargets(r`\A\R\n\z`);
    });
  });
});
54 changes: 30 additions & 24 deletions src/generate.js
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ const CharCodeEscapeMap = new Map([
[11, r`\v`], // vertical tab
[12, r`\f`], // form feed
[13, r`\r`], // carriage return
[0x2028, r`\u2028`], // line separator
[0x2029, r`\u2029`], // paragraph separator
]);

const casedRe = /^\p{Cased}$/u;
Expand Down Expand Up @@ -250,7 +252,7 @@ function genCapturingGroup({name, number, alternatives}, state, gen) {

function genCharacter({value}, state) {
const char = cp(value);
const escaped = getEscapedChar(value, {
const escaped = getCharEscape(value, {
isAfterBackref: state.lastNode.type === AstTypes.Backreference,
inCharClass: state.inCharClass,
useFlagV: state.useFlagV,
Expand Down Expand Up @@ -302,17 +304,17 @@ function genCharacterClassRange(node, state) {
inCharClass: true,
useFlagV: state.useFlagV,
};
const minStr = getEscapedChar(min, escOpts);
const maxStr = getEscapedChar(max, escOpts);
const minStr = getCharEscape(min, escOpts);
const maxStr = getCharEscape(max, escOpts);
let extraChars = '';
if (state.useAppliedIgnoreCase && state.currentFlags.ignoreCase) {
// [TODO] Avoid duplication by considering other chars in the parent char class when expanding
const charsOutsideRange = getCasesOutsideCharClassRange(node);
const ranges = getCodePointRangesFromChars(charsOutsideRange);
ranges.forEach(value => {
extraChars += Array.isArray(value) ?
`${getEscapedChar(value[0], escOpts)}-${getEscapedChar(value[1], escOpts)}` :
getEscapedChar(value, escOpts);
`${getCharEscape(value[0], escOpts)}-${getCharEscape(value[1], escOpts)}` :
getCharEscape(value, escOpts);
});
}
// Create the range without calling `gen` on the `min`/`max` kids
Expand Down Expand Up @@ -429,36 +431,23 @@ function getCasesOutsideCharClassRange(node, {firstOnly} = {}) {
return found;
}

/**
Converts a list of characters to their code points in ascending order, collapsing each run of
consecutive code points into a `[first, last]` pair. Code points that aren't part of a run are
returned as plain numbers.
@param {Array<string>} chars Only the first code point of each string is used.
@returns {Array<number | [number, number]>}
*/
function getCodePointRangesFromChars(chars) {
  const sorted = chars.map(ch => ch.codePointAt(0));
  sorted.sort((x, y) => x - y);
  const result = [];
  // First code point of the run currently being collected, or `null` when not inside a run
  let runStart = null;
  for (const [index, current] of sorted.entries()) {
    const continuesRun = sorted[index + 1] === current + 1;
    if (continuesRun) {
      if (runStart === null) {
        runStart = current;
      }
      continue;
    }
    // The run (if any) ends here; emit either a lone code point or the completed range
    result.push(runStart === null ? current : [runStart, current]);
    runStart = null;
  }
  return result;
}

// This shouldn't modify any char that has case
function getEscapedChar(codePoint, {isAfterBackref, inCharClass, useFlagV}) {
function getCharEscape(codePoint, {isAfterBackref, inCharClass, useFlagV}) {
if (CharCodeEscapeMap.has(codePoint)) {
return CharCodeEscapeMap.get(codePoint);
}
if (
// Control chars, etc.; condition modeled on the Chrome developer console's display for strings
codePoint < 32 || (codePoint > 126 && codePoint < 160) ||
// Unicode planes 4-16; unassigned, special purpose, and private use area
codePoint > 0x3FFFF ||
// Avoid corrupting a preceding backref by immediately following it with a literal digit
(isAfterBackref && isDigitCharCode(codePoint))
) {
// Don't convert codePoint `0` to `\0` since that's corruptible by following literal digits
return r`\x${codePoint.toString(16).padStart(2, '0')}`;
return codePoint > 0xFF ?
r`\u{${codePoint.toString(16).toUpperCase()}}` :
r`\x${codePoint.toString(16).toUpperCase().padStart(2, '0')}`;
}
const escapeChars = inCharClass ?
(useFlagV ? CharClassEscapeCharsFlagV : CharClassEscapeChars) :
Expand All @@ -467,6 +456,23 @@ function getEscapedChar(codePoint, {isAfterBackref, inCharClass, useFlagV}) {
return (escapeChars.has(char) ? '\\' : '') + char;
}

/**
Converts a list of characters to their code points in ascending order, collapsing each run of
consecutive code points into a `[first, last]` pair. Code points that aren't part of a run are
returned as plain numbers.
@param {Array<string>} chars Only the first code point of each string is used.
@returns {Array<number | [number, number]>}
*/
function getCodePointRangesFromChars(chars) {
  const sorted = chars.map(ch => ch.codePointAt(0));
  sorted.sort((x, y) => x - y);
  const result = [];
  // First code point of the run currently being collected, or `null` when not inside a run
  let runStart = null;
  for (const [index, current] of sorted.entries()) {
    const continuesRun = sorted[index + 1] === current + 1;
    if (continuesRun) {
      if (runStart === null) {
        runStart = current;
      }
      continue;
    }
    // The run (if any) ends here; emit either a lone code point or the completed range
    result.push(runStart === null ? current : [runStart, current]);
    runStart = null;
  }
  return result;
}

function getGroupPrefix(atomic, flagMods, useFlagMods) {
if (atomic) {
return '>';
Expand Down
2 changes: 1 addition & 1 deletion src/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ const AstCharacterSetKinds = TokenCharacterSetKinds;
const AstDirectiveKinds = TokenDirectiveKinds;

// Kind names for variable-length character sets. Each key maps to an identical string value. The
// accompanying specs exercise `grapheme` via `\X` and `newline` via `\R`.
const AstVariableLengthCharacterSetKinds = {
  grapheme: 'grapheme',
  newline: 'newline',
};

/**
Expand Down

0 comments on commit 7bc4aa5

Please sign in to comment.