From e3bf3f0f80b7ea1ef2b0dbc80bb0913864eb55d7 Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Tue, 21 Jan 2025 04:32:05 +0100 Subject: [PATCH] Support absent repeaters (#13) --- README.md | 21 +++++++++++++++------ src/index.js | 4 ++-- src/parse.js | 38 +++++++++++++++++++++++++++++++++----- src/subclass.js | 4 ++-- src/tokenize.js | 8 ++++---- src/transform.js | 16 +++++++++++++--- src/traverse.js | 1 + 7 files changed, 70 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index d5a2e2c..53a2c92 100644 --- a/README.md +++ b/README.md @@ -908,7 +908,7 @@ Notice that nearly every feature below has at least subtle differences from Java - Other + Other Comment group (?#…) ✅ @@ -928,6 +928,15 @@ Notice that nearly every feature below has at least subtle differences from Java ✔ Same as JS
+ + Absent repeater + (?~…) + ✅ + ✅ + + ✔ Supported[6]
+ + Keep \K @@ -985,6 +994,7 @@ The table above doesn't include all aspects that Oniguruma-To-ES emulates (inclu 3. Target `ES2018` doesn't support nested *negated* character classes. 4. It's not an error for *numbered* backreferences to come before their referenced group in Oniguruma, but an error is the best path for Oniguruma-To-ES because ① most placements are mistakes and can never match (based on the Oniguruma behavior for backreferences to nonparticipating groups), ② erroring matches the behavior of named backreferences, and ③ the edge cases where they're matchable rely on rules for backreference resetting within quantified groups that are different in JavaScript and aren't emulatable. Note that it's not a backreference in the first place if using `\10` or higher and not as many capturing groups are defined to the left (it's an octal or identity escape). 5. Oniguruma's recursion depth limit is `20`. Oniguruma-To-ES uses the same limit by default but allows customizing it via the `rules.recursionLimit` option. Two rare uses of recursion aren't yet supported: overlapping recursions, and use of backreferences when a recursed subpattern contains captures. Patterns that would trigger an infinite recursion error in Oniguruma might find a match in Oniguruma-To-ES (since recursion is bounded), but future versions will detect this and error at transpilation time. +6. Exotic (and extremely rare) forms of absent functions that start with `(?~|` (absent expressions, stoppers, and clearers) aren't yet supported. ## ❌ Unsupported features @@ -996,19 +1006,18 @@ The following throw errors since they aren't yet supported. They're all extremel - Grapheme boundaries: `\y`, `\Y`. - Flags `P` (POSIX is ASCII) and `y{g}`/`y{w}` (grapheme boundary modes). - Whole-pattern modifier: Don't capture group `(?C)`. - - Callout: `(*FAIL)`. + - Named callout: `(*FAIL)`. - Supportable for some uses: - - Absence functions: `(?~…)`, etc. - Conditionals: `(?(…)…)`, etc. 
- Whole-pattern modifiers: Ignore-case is ASCII `(?I)`, find longest `(?L)`. - - Callout pair: `(*SKIP)(*FAIL)`. + - Named callout pair: `(*SKIP)(*FAIL)`. - Not supportable: - Other callouts: `(?{…})`, `(*…)`, etc. -Note that Oniguruma-To-ES supports 99.9+% of real-world Oniguruma regexes, based on a sample of tens of thousands of regexes used in TextMate grammars. Of the features listed above, absence functions and conditionals were used in 2–3 regexes each. The rest weren't used at all. - See also the [supported features](#-supported-features) table (above) which describes some additional rarely-used sub-features that aren't currently supported. +Note that Oniguruma-To-ES supports 99.9+% of real-world Oniguruma regexes, based on a sample of tens of thousands of regexes used in TextMate grammars. Of the features listed above, conditionals were used in three regexes. The rest weren't used at all. Some Oniguruma features are so exotic that they're *used* zero times in all of public GitHub. + Contributions are welcome if you want to add support for currently unsupported features. diff --git a/src/index.js b/src/index.js index d332d16..9330657 100644 --- a/src/index.js +++ b/src/index.js @@ -76,8 +76,8 @@ function toDetails(pattern, options) { const strategy = regexAst._strategy; if (useEmulationGroups || strategy) { result.options = { - ...(strategy ? {strategy} : null), - ...(useEmulationGroups ? 
{useEmulationGroups} : null), + ...(strategy && {strategy}), + ...(useEmulationGroups && {useEmulationGroups}), }; } return result; diff --git a/src/parse.js b/src/parse.js index 1d93b50..238ed5e 100644 --- a/src/parse.js +++ b/src/parse.js @@ -5,6 +5,7 @@ import {getOrCreate, r, throwIfNot} from './utils.js'; import {hasOnlyChild} from './utils-ast.js'; const AstTypes = { + AbsentFunction: 'AbsentFunction', Alternative: 'Alternative', Assertion: 'Assertion', Backreference: 'Backreference', @@ -26,6 +27,11 @@ const AstTypes = { Recursion: 'Recursion', }; +const AstAbsentFunctionKinds = { + // See + repeater: 'repeater', +}; + const AstAssertionKinds = { line_end: 'line_end', line_start: 'line_start', @@ -320,6 +326,10 @@ function parseGroupOpen(context, state) { getOrCreate(namedGroupsByName, node.name, []).push(node); } } + if (node.type === AstTypes.AbsentFunction && state.isInAbsentFunction) { + // Doesn't throw in Onig but produces weird results and is described as unsupported in docs + throw new Error('Nested absent function not supported by Oniguruma'); + } let nextToken = throwIfUnclosedGroup(tokens[context.current]); while (nextToken.type !== TokenTypes.GroupClose) { if (nextToken.type === TokenTypes.Alternator) { @@ -327,17 +337,19 @@ function parseGroupOpen(context, state) { // Skip the alternator context.current++; } else { - const alt = node.alternatives.at(-1); + const isAbsentFunction = node.type === AstTypes.AbsentFunction; const isLookbehind = node.kind === AstAssertionKinds.lookbehind; const isNegLookbehind = isLookbehind && node.negate; + const alt = node.alternatives.at(-1); const child = walk(alt, { ...state, + isInAbsentFunction: state.isInAbsentFunction || isAbsentFunction, isInLookbehind: state.isInLookbehind || isLookbehind, isInNegLookbehind: state.isInNegLookbehind || isNegLookbehind, }); - alt.elements.push(child); + // Centralized validation of lookbehind contents if ((isLookbehind || state.isInLookbehind) && !skipLookbehindValidation) { 
- // JS supports all features within lookbehind, but Onig doesn't. Absence functions of form + // JS supports all features within lookbehind, but Onig doesn't. Absent functions of form // `(?~|)` and `(?~|…)` are also invalid in lookbehind (the `(?~…)` and `(?~|…|…)` forms // are allowed), but all forms with `(?~|` throw since they aren't yet supported const msg = 'Lookbehind includes a pattern not allowed by Oniguruma'; @@ -355,6 +367,7 @@ function parseGroupOpen(context, state) { } } } + alt.elements.push(child); } nextToken = throwIfUnclosedGroup(tokens[context.current]); } @@ -434,6 +447,17 @@ function parseSubroutine(context) { return node; } +function createAbsentFunction(kind) { + if (kind !== AstAbsentFunctionKinds.repeater) { + throw new Error(`Unexpected absent function kind "${kind}"`); + } + return { + type: AstTypes.AbsentFunction, + kind, + alternatives: [createAlternative()], + }; +} + function createAlternative() { return { type: AstTypes.Alternative, @@ -447,7 +471,7 @@ function createAssertion(kind, options) { return { type: AstTypes.Assertion, kind, - ...(kind === AstAssertionKinds.word_boundary ? 
{negate} : null), + ...(kind === AstAssertionKinds.word_boundary && {negate}), }; } @@ -478,6 +502,8 @@ function createBackreference(ref, options) { function createByGroupKind({flags, kind, name, negate, number}) { switch (kind) { + case TokenGroupKinds.absent_repeater: + return createAbsentFunction(AstAbsentFunctionKinds.repeater); case TokenGroupKinds.atomic: return createGroup({atomic: true}); case TokenGroupKinds.capturing: @@ -634,7 +660,7 @@ function createPattern() { }; } -function createQuantifier(element, min, max, greedy, possessive) { +function createQuantifier(element, min, max, greedy = true, possessive = false) { const node = { type: AstTypes.Quantifier, min, @@ -766,11 +792,13 @@ function throwIfUnclosedGroup(token) { } export { + AstAbsentFunctionKinds, AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, + createAbsentFunction, createAlternative, createAssertion, createBackreference, diff --git a/src/subclass.js b/src/subclass.js index ea59377..d4f7e99 100644 --- a/src/subclass.js +++ b/src/subclass.js @@ -72,8 +72,8 @@ class EmulatedRegExp extends RegExpSubclass { pattern, flags: flags ?? '', options: { - ...(opts.strategy ? {strategy: opts.strategy} : null), - ...(opts.useEmulationGroups ? 
{useEmulationGroups: true} : null), + ...(opts.strategy && {strategy: opts.strategy}), + ...(opts.useEmulationGroups && {useEmulationGroups: true}), }, }; } diff --git a/src/tokenize.js b/src/tokenize.js index 1d7f63f..042479d 100644 --- a/src/tokenize.js +++ b/src/tokenize.js @@ -40,7 +40,7 @@ const TokenDirectiveKinds = { }; const TokenGroupKinds = { - absence: 'absence', + absent_repeater: 'absent_repeater', atomic: 'atomic', capturing: 'capturing', group: 'group', @@ -358,15 +358,15 @@ function getTokenWithDetails(context, pattern, m, lastIndex) { } return { token, - } + }; } if (m2 === '~') { if (m === '(?~|') { - throw new Error(`Unsupported absence function type "${m}"`); + throw new Error(`Unsupported absent function kind "${m}"`); } return { token: createToken(TokenTypes.GroupOpen, m, { - kind: TokenGroupKinds.absence, + kind: TokenGroupKinds.absent_repeater, }), }; } diff --git a/src/transform.js b/src/transform.js index 0a898ac..54d4854 100644 --- a/src/transform.js +++ b/src/transform.js @@ -1,5 +1,5 @@ import {Accuracy, Target} from './options.js'; -import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createAssertion, createBackreference, createCapturingGroup, createCharacterSet, createGroup, createLookaround, createUnicodeProperty, parse} from './parse.js'; +import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createAssertion, createBackreference, createCapturingGroup, createCharacterSet, createGroup, createLookaround, createQuantifier, createUnicodeProperty, parse} from './parse.js'; import {tokenize} from './tokenize.js'; import {traverse} from './traverse.js'; import {JsUnicodeProperties, PosixClassesMap} from './unicode.js'; @@ -100,6 +100,17 @@ function transform(ast, options) { } const FirstPassVisitor = { + AbsentFunction({node, replaceWith}) { + // Convert absent repeater `(?~…)` to 
`(?:(?:(?!…)\p{Any})*)` + const group = prepContainer(createGroup(), [ + adoptAndSwapKids(createLookaround({negate: true}), node.alternatives), + createUnicodeProperty('Any'), + ]); + const quantifier = createQuantifier(group, 0, Infinity); + group.parent = quantifier; + replaceWith(prepContainer(createGroup(), [quantifier])); + }, + Alternative: { enter({node, parent, key}, {flagDirectivesByAlt}) { // Look for own-level flag directives when entering an alternative because after traversing @@ -587,7 +598,7 @@ const ThirdPassVisitor = { if (!participants.length) { // If no participating capture, convert backref to `(?!)`; backrefs to nonparticipating // groups can't match in Onig but match the empty string in JS - replaceWith(createLookaround({negate: true})); + replaceWith(prepContainer(createLookaround({negate: true}))); } else if (participants.length > 1) { // Multiplex const alts = participants.map(reffed => adoptAndSwapKids( @@ -910,6 +921,5 @@ function traverseReplacement(replacement, {parent, key, container}, state, visit } export { - adoptAndSwapKids, transform, }; diff --git a/src/traverse.js b/src/traverse.js index 2ee7430..9ce918a 100644 --- a/src/traverse.js +++ b/src/traverse.js @@ -75,6 +75,7 @@ function traverse(path, state, visitor) { case AstTypes.Subroutine: case AstTypes.VariableLengthCharacterSet: break; + case AstTypes.AbsentFunction: case AstTypes.CapturingGroup: case AstTypes.Group: case AstTypes.Pattern: