From 368364d47b8233016d104bf4824495714ef5ce07 Mon Sep 17 00:00:00 2001 From: leaysgur <6259812+leaysgur@users.noreply.github.com> Date: Tue, 20 Aug 2024 02:19:24 +0000 Subject: [PATCH] feat(regex_parser): Implement `RegExp` parser (#3824) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part of #1164 ## Progress updates 🗞️ Waiting for the review and advice, while thinking how to handle escaped string when `new RegExp(pat)`. ## TODOs - [x] `RegExp(Literal = Body + Flags)#parse()` structure - [x] Base `Reader` impl to handle both unicode(u32) and utf-16(u16) units - [x] Global `Span` and local offset conversion - [x] Design AST shapes - [x] Keep `enum` size small by `Box<'a, T>` - [x] Rework AST shapes - [x] Split body and flags w/ validating literal - [x] Parse `RegExpFlags` - [x] Parse `RegExpBody` = `Pattern` - [x] Parse `Pattern` > `Disjunction` - [x] Parse `Disjunction` > `Alternative` - [x] Parse `Alternative` > `Term` - [x] Parse `Term` > `Assertion` - [x] Parse `BoundaryAssertion` - [x] Parse `LookaroundAssertion` - [x] Parse `Term` > `Quantifier` - [x] Parse `Term` > `Atom` - [x] Parse `Atom` > `PatternCharacter` - [x] Parse `Atom` > `.` - [x] Parse `Atom` > `\AtomEscape` - [x] Parse `\AtomEscape` > `DecimalEscape` - [x] Parse `\AtomEscape` > `CharacterClassEscape` - [x] Parse `CharacterClassEscape` > `\d, \D, \s, \S, \w, \W` - [x] Parse `CharacterClassEscape` > `\p{UnicodePropertyValueExpression}, \P{UnicodePropertyValueExpression}` - [x] Parse `\AtomEscape` > `CharacterEscape` - [x] Parse `CharacterEscape` > `ControlEscape` - [x] Parse `CharacterEscape` > `c AsciiLetter` - [x] Parse `CharacterEscape` > `0` - [x] Parse `CharacterEscape` > `HexEscapeSequence` - [x] Parse `CharacterEscape` > `RegExpUnicodeEscapeSequence` - [x] Parse `CharacterEscape` > `IdentityEscape` - [x] Parse `\AtomEscape` > `kGroupName` - [x] Parse `Atom` > `[CharacterClass]` - [x] Parse `[CharacterClass]` > `ClassContents` > `[~UnicodeSetsMode] 
NonemptyClassRanges` - [x] Parse `[CharacterClass]` > `ClassContents` > `[+UnicodeSetsMode] ClassSetExpression` - [x] Parse `ClassSetExpression` > `ClassUnion` - [x] Parse `ClassSetExpression` > `ClassIntersection` - [x] Parse `ClassSetExpression` > `ClassSubtraction` - [x] Parse `ClassSetExpression` > `ClassSetOperand` - [x] Parse `ClassSetExpression` > `ClassSetRange` - [x] Parse `ClassSetExpression` > `ClassSetCharacter` - [x] Parse `Atom` > `(GroupSpecifier)` - [x] Parse `Atom` > `(?:Disjunction)` - [x] Annex B - [x] Parse `QuantifiableAssertion` - [x] Parse `ExtendedAtom` - [x] Parse `ExtendedAtom` > `\ [lookahead = c]` - [x] Parse `ExtendedAtom` > `InvalidBracedQuantifier` - [x] Parse `ExtendedAtom` > `ExtendedPatternCharacter` - [x] Parse `ExtendedAtom` > `\AtomEscape` > `CharacterEscape` > `LegacyOctalEscapeSequence` - [x] Early errors - [x] Pattern :: Disjunction(1/2) - [x] Pattern :: Disjunction(2/2) - [x] QuantifierPrefix :: { DecimalDigits , DecimalDigits } - [x] ExtendedAtom :: InvalidBracedQuantifier (Annex B) - [x] AtomEscape :: k GroupName - [x] AtomEscape :: DecimalEscape - [x] NonemptyClassRanges :: ClassAtom - ClassAtom ClassContents(1/2) - [x] NonemptyClassRanges :: ClassAtom - ClassAtom ClassContents(2/2) - [x] NonemptyClassRanges :: ClassAtom - ClassAtom ClassContents(Annex B) - [x] NonemptyClassRangesNoDash :: ClassAtomNoDash - ClassAtom ClassContents(1/2) - [x] NonemptyClassRangesNoDash :: ClassAtomNoDash - ClassAtom ClassContents(2/2) - [x] NonemptyClassRangesNoDash :: ClassAtomNoDash - ClassAtom ClassContents(Annex B) - [x] RegExpIdentifierStart :: \ RegExpUnicodeEscapeSequence - [x] RegExpIdentifierStart :: UnicodeLeadSurrogate UnicodeTrailSurrogate - [x] RegExpIdentifierPart :: \ RegExpUnicodeEscapeSequence - [x] RegExpIdentifierPart :: UnicodeLeadSurrogate UnicodeTrailSurrogate - [x] UnicodePropertyValueExpression :: UnicodePropertyName = UnicodePropertyValue(1/2) - [x] UnicodePropertyValueExpression :: UnicodePropertyName = 
UnicodePropertyValue(2/2) - [x] UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue(1/2) - [x] UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue(2/2) - [x] CharacterClassEscape :: P{ UnicodePropertyValueExpression } - [x] CharacterClass :: [^ ClassContents ] - [x] NestedClass :: [^ ClassContents ] - [x] ClassSetRange :: ClassSetCharacter - ClassSetCharacter - [x] Add `Span` to `Err(OxcDiagnostic::error())` calls - [x] Perf improvement - [x] `Reader#peek()` should avoid `iter.next()` equivalent - [x] ~~Use `char` everywhere and split and push 2 surrogates(pair) for `Character`?~~ - [x] ~~Try 1(+1) loop parsing for capturing groups?~~ ## Follow up - [x] @Boshen Test suite > #4242 - [x] Investigate CI errors... - Next... - Support ES2025 Duplicate named capturing groups? - Support ES20XX Stage3 Modifiers? --- .typos.toml | 5 +- Cargo.lock | 20 +- crates/oxc_js_regex/Cargo.toml | 24 - crates/oxc_js_regex/README.md | 5 - crates/oxc_js_regex/src/ast.rs | 387 --- crates/oxc_js_regex/src/lexer/mod.rs | 1 - crates/oxc_js_regex/src/lexer/token.rs | 1 - crates/oxc_js_regex/src/lib.rs | 5 - crates/oxc_js_regex/src/parser.rs | 1 - crates/oxc_js_regex/src/validator.rs | 1 - crates/oxc_js_regex/src/visitor.rs | 1 - crates/oxc_regexp_parser/Cargo.toml | 29 + crates/oxc_regexp_parser/README.md | 8 + .../examples/parse_literal.rs | 63 + crates/oxc_regexp_parser/src/ast.rs | 262 +++ .../oxc_regexp_parser/src/body_parser/mod.rs | 251 ++ .../src/body_parser/parser.rs | 2092 +++++++++++++++++ .../src/body_parser/reader.rs | 247 ++ .../src/body_parser/state.rs | 141 ++ .../src/body_parser/unicode.rs | 146 ++ .../src/body_parser/unicode_property.rs | 354 +++ crates/oxc_regexp_parser/src/flag_parser.rs | 69 + crates/oxc_regexp_parser/src/lib.rs | 13 + .../oxc_regexp_parser/src/literal_parser.rs | 161 ++ crates/oxc_regexp_parser/src/options.rs | 25 + crates/oxc_regexp_parser/src/span.rs | 16 + 26 files changed, 3893 insertions(+), 435 deletions(-) delete mode 
100644 crates/oxc_js_regex/Cargo.toml delete mode 100644 crates/oxc_js_regex/README.md delete mode 100644 crates/oxc_js_regex/src/ast.rs delete mode 100644 crates/oxc_js_regex/src/lexer/mod.rs delete mode 100644 crates/oxc_js_regex/src/lexer/token.rs delete mode 100644 crates/oxc_js_regex/src/lib.rs delete mode 100644 crates/oxc_js_regex/src/parser.rs delete mode 100644 crates/oxc_js_regex/src/validator.rs delete mode 100644 crates/oxc_js_regex/src/visitor.rs create mode 100644 crates/oxc_regexp_parser/Cargo.toml create mode 100644 crates/oxc_regexp_parser/README.md create mode 100644 crates/oxc_regexp_parser/examples/parse_literal.rs create mode 100644 crates/oxc_regexp_parser/src/ast.rs create mode 100644 crates/oxc_regexp_parser/src/body_parser/mod.rs create mode 100644 crates/oxc_regexp_parser/src/body_parser/parser.rs create mode 100644 crates/oxc_regexp_parser/src/body_parser/reader.rs create mode 100644 crates/oxc_regexp_parser/src/body_parser/state.rs create mode 100644 crates/oxc_regexp_parser/src/body_parser/unicode.rs create mode 100644 crates/oxc_regexp_parser/src/body_parser/unicode_property.rs create mode 100644 crates/oxc_regexp_parser/src/flag_parser.rs create mode 100644 crates/oxc_regexp_parser/src/lib.rs create mode 100644 crates/oxc_regexp_parser/src/literal_parser.rs create mode 100644 crates/oxc_regexp_parser/src/options.rs create mode 100644 crates/oxc_regexp_parser/src/span.rs diff --git a/.typos.toml b/.typos.toml index e427c79d81d52..430d43141fae0 100644 --- a/.typos.toml +++ b/.typos.toml @@ -23,7 +23,10 @@ extend-exclude = [ ] [default] -extend-ignore-re = ["(?Rm)^.*(#|//)\\s*spellchecker:disable-line$"] +extend-ignore-re = [ + "(?Rm)^.*(#|//)\\s*spellchecker:disable-line$", + "(?s)(#|//)\\s*spellchecker:off.*?\\n\\s*(#|//)\\s*spellchecker:on", +] [default.extend-words] trivias = "trivias" diff --git a/Cargo.lock b/Cargo.lock index 1efc4d4331610..866b4300e8185 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1563,14 +1563,6 @@ dependencies = 
[ "rustc-hash", ] -[[package]] -name = "oxc_js_regex" -version = "0.0.0" -dependencies = [ - "oxc_allocator", - "oxc_span", -] - [[package]] name = "oxc_language_server" version = "0.0.1" @@ -1773,6 +1765,18 @@ dependencies = [ "walkdir", ] +[[package]] +name = "oxc_regexp_parser" +version = "0.0.0" +dependencies = [ + "oxc_allocator", + "oxc_diagnostics", + "oxc_span", + "phf 0.11.2", + "rustc-hash", + "unicode-id-start", +] + [[package]] name = "oxc_resolver" version = "1.10.2" diff --git a/crates/oxc_js_regex/Cargo.toml b/crates/oxc_js_regex/Cargo.toml deleted file mode 100644 index 4027e4610181e..0000000000000 --- a/crates/oxc_js_regex/Cargo.toml +++ /dev/null @@ -1,24 +0,0 @@ -[package] -name = "oxc_js_regex" -version = "0.0.0" -publish = false -authors = ["Ubugeeei "] -categories.workspace = true -description.workspace = true -edition.workspace = true -homepage.workspace = true -keywords.workspace = true -license.workspace = true -repository.workspace = true -rust-version.workspace = true -include = ["/src"] - -[lints] -workspace = true - -[lib] -doctest = false - -[dependencies] -oxc_allocator = { workspace = true } -oxc_span = { workspace = true } diff --git a/crates/oxc_js_regex/README.md b/crates/oxc_js_regex/README.md deleted file mode 100644 index f1e6b2fc5370f..0000000000000 --- a/crates/oxc_js_regex/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# oxc_js_regex - -⚠️ Work in progress. Do not use yet. - -see: https://github.com/oxc-project/oxc/issues/1164 \ No newline at end of file diff --git a/crates/oxc_js_regex/src/ast.rs b/crates/oxc_js_regex/src/ast.rs deleted file mode 100644 index 54f06cbc40a6d..0000000000000 --- a/crates/oxc_js_regex/src/ast.rs +++ /dev/null @@ -1,387 +0,0 @@ -//! [`@eslint-community/regexpp`](https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/ast.ts) - -use oxc_allocator::{Box, Vec}; -use oxc_span::{CompactStr, Span}; - -/// The type which includes all nodes. 
-#[derive(Debug)] -pub enum Node<'a> { - Branch(Box<'a, Branch<'a>>), - Leaf(Box<'a, Leaf<'a>>), -} - -/// The type which includes all branch nodes. -#[derive(Debug)] -pub enum Branch<'a> { - Alternative(Box<'a, Alternative<'a>>), - CapturingGroup(Box<'a, CapturingGroup<'a>>), - CharacterClass(Box<'a, CharacterClass<'a>>), - CharacterClassRange(Box<'a, CharacterClassRange>), - ClassIntersection(Box<'a, ClassIntersection<'a>>), - ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>), - ClassSubtraction(Box<'a, ClassSubtraction<'a>>), - ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), - Group(Box<'a, Group<'a>>), - LookaroundAssertion(Box<'a, LookaroundAssertion<'a>>), - Pattern(Box<'a, Pattern<'a>>), - Quantifier(Box<'a, Quantifier<'a>>), - RegExpLiteral(Box<'a, RegExpLiteral<'a>>), - StringAlternative(Box<'a, StringAlternative<'a>>), -} - -/// The type which includes all leaf nodes. -#[derive(Debug)] -pub enum Leaf<'a> { - Backreference(Box<'a, Backreference<'a>>), - BoundaryAssertion(Box<'a, BoundaryAssertion<'a>>), - Character(Box<'a, Character>), - CharacterSet(Box<'a, CharacterSet<'a>>), - Flags(Box<'a, Flags>), -} - -/// The type which includes all atom nodes. -#[derive(Debug)] -pub enum Element<'a> { - Assertion(Box<'a, Assertion<'a>>), - QuantifiableElement(Box<'a, QuantifiableElement<'a>>), - Quantifier(Box<'a, Quantifier<'a>>), -} - -/// The type which includes all atom nodes that Quantifier node can have as children. -#[derive(Debug)] -pub enum QuantifiableElement<'a> { - Backreference(Box<'a, Backreference<'a>>), - CapturingGroup(Box<'a, CapturingGroup<'a>>), - Character(Box<'a, Character>), - CharacterClass(Box<'a, CharacterClass<'a>>), - CharacterSet(Box<'a, CharacterSet<'a>>), - ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), - Group(Box<'a, Group<'a>>), - LookaheadAssertion(Box<'a, LookaheadAssertion<'a>>), -} - -/// The type which includes all character class atom nodes. 
-#[derive(Debug)] -pub enum CharacterClassElement<'a> { - ClassRangesCharacterClassElement(Box<'a, ClassRangesCharacterClassElement<'a>>), - UnicodeSetsCharacterClassElement(Box<'a, UnicodeSetsCharacterClassElement<'a>>), -} -#[derive(Debug)] -pub enum ClassRangesCharacterClassElement<'a> { - Character(Box<'a, Character>), - CharacterClassRange(Box<'a, CharacterClassRange>), - CharacterUnicodePropertyCharacterSet(Box<'a, CharacterUnicodePropertyCharacterSet>), - EscapeCharacterSet(Box<'a, EscapeCharacterSet>), -} -#[derive(Debug)] -pub enum UnicodeSetsCharacterClassElement<'a> { - Character(Box<'a, Character>), - CharacterClassRange(Box<'a, CharacterClassRange>), - ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>), - EscapeCharacterSet(Box<'a, EscapeCharacterSet>), - ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), - UnicodePropertyCharacterSet(Box<'a, UnicodePropertyCharacterSet<'a>>), - UnicodeSetsCharacterClass(Box<'a, UnicodeSetsCharacterClass<'a>>), -} - -/// The root node. -#[derive(Debug)] -pub struct RegExpLiteral<'a> { - pub span: Span, - pub pattern: Pattern<'a>, - pub flags: Flags, -} - -/// The pattern. -#[derive(Debug)] -pub struct Pattern<'a> { - pub span: Span, - pub alternatives: Vec<'a, Alternative<'a>>, -} - -/// The alternative. -/// E.g. `a|b` -#[derive(Debug)] -pub struct Alternative<'a> { - pub span: Span, - pub elements: Vec<'a, Element<'a>>, -} - -/// The uncapturing group. -/// E.g. `(?:ab)` -#[derive(Debug)] -pub struct Group<'a> { - pub span: Span, - pub alternatives: Vec<'a, Alternative<'a>>, -} - -/// The capturing group. -/// E.g. `(ab)`, `(?ab)` -#[derive(Debug)] -pub struct CapturingGroup<'a> { - pub span: Span, - pub name: Option, - pub alternatives: Vec<'a, Alternative<'a>>, - pub references: Vec<'a, Backreference<'a>>, -} - -/// The lookaround assertion. 
-#[derive(Debug)] -pub enum LookaroundAssertion<'a> { - LookaheadAssertion(Box<'a, LookaheadAssertion<'a>>), - LookbehindAssertion(Box<'a, LookbehindAssertion<'a>>), -} - -/// The lookahead assertion. -/// E.g. `(?=ab)`, `(?!ab)` -#[derive(Debug)] -pub struct LookaheadAssertion<'a> { - pub span: Span, - pub negate: bool, - pub alternatives: Vec<'a, Alternative<'a>>, -} - -/// The lookbehind assertion. -/// E.g. `(?<=ab)`, `(? { - pub span: Span, - pub negate: bool, - pub alternatives: Vec<'a, Alternative<'a>>, -} - -/// The quantifier. -/// E.g. `a?`, `a*`, `a+`, `a{1,2}`, `a??`, `a*?`, `a+?`, `a{1,2}?` -#[derive(Debug)] -pub struct Quantifier<'a> { - pub span: Span, - pub min: f64, - pub max: f64, // can be f64::INFINITY - pub greedy: bool, - pub element: QuantifiableElement<'a>, -} - -/// The character class. -/// E.g. `[ab]`, `[^ab]` -#[derive(Debug)] -pub enum CharacterClass<'a> { - ClassRangesCharacterClass(Box<'a, ClassRangesCharacterClass<'a>>), - UnicodeSetsCharacterClass(Box<'a, UnicodeSetsCharacterClass<'a>>), -} - -/// The character class used in legacy (neither `u` nor `v` flag) and Unicode mode (`u` flag). -/// This character class is guaranteed to **not** contain strings. -/// In Unicode sets mode (`v` flag), {@link UnicodeSetsCharacterClass} is used. -#[derive(Debug)] -pub struct ClassRangesCharacterClass<'a> { - pub span: Span, - pub unicode_sets: bool, - pub elements: Vec<'a, ClassRangesCharacterClassElement<'a>>, -} - -/// The character class used in Unicode sets mode (`v` flag). -/// This character class may contain strings. -#[derive(Debug)] -pub struct UnicodeSetsCharacterClass<'a> { - pub span: Span, - pub elements: Vec<'a, UnicodeSetsCharacterClassElement<'a>>, -} - -/// The character class. -/// E.g. `[a-b]` -#[derive(Debug)] -pub struct CharacterClassRange { - pub span: Span, - pub min: Character, - pub max: Character, -} - -/// The assertion. 
-#[derive(Debug)] -pub enum Assertion<'a> { - BoundaryAssertion(Box<'a, BoundaryAssertion<'a>>), - LookaroundAssertion(Box<'a, LookaroundAssertion<'a>>), -} - -/// The boundary assertion. -#[derive(Debug)] -pub enum BoundaryAssertion<'a> { - EdgeAssertion(Box<'a, EdgeAssertion>), - WordBoundaryAssertion(Box<'a, WordBoundaryAssertion>), -} - -/// The edge boundary assertion. -/// E.g. `^`, `$` -#[derive(Debug)] -pub struct EdgeAssertion { - pub span: Span, - pub kind: EdgeAssertionKind, -} - -#[derive(Debug)] -pub enum EdgeAssertionKind { - Start, - End, -} - -/// The word boundary assertion. -/// E.g. `\b`, `\B` -#[derive(Debug)] -pub struct WordBoundaryAssertion { - pub span: Span, - pub negate: bool, -} - -/// The character set. -#[derive(Debug)] -pub enum CharacterSet<'a> { - AnyCharacterSet, - EscapeCharacterSet(Box<'a, EscapeCharacterSet>), - UnicodePropertyCharacterSet(Box<'a, UnicodePropertyCharacterSet<'a>>), -} - -/// The character class escape. -/// E.g. `\d`, `\s`, `\w`, `\D`, `\S`, `\W` -#[derive(Debug)] -pub struct EscapeCharacterSet { - pub span: Span, - pub kind: EscapeCharacterSetKind, - pub negate: bool, -} - -#[derive(Debug)] -pub enum EscapeCharacterSetKind { - Digit, - Space, - Word, -} - -/// The unicode property escape. -/// E.g. `\p{ASCII}`, `\P{ASCII}`, `\p{Script=Hiragana}` -#[derive(Debug)] -pub enum UnicodePropertyCharacterSet<'a> { - CharacterUnicodePropertyCharacterSet(Box<'a, CharacterUnicodePropertyCharacterSet>), - StringsUnicodePropertyCharacterSet(Box<'a, StringsUnicodePropertyCharacterSet>), -} - -#[derive(Debug)] -pub struct CharacterUnicodePropertyCharacterSet { - pub span: Span, - pub key: CompactStr, - pub value: Option, - pub negate: bool, -} - -/// StringsUnicodePropertyCharacterSet is Unicode property escape with property of strings. -#[derive(Debug)] -pub struct StringsUnicodePropertyCharacterSet { - pub span: Span, - pub key: CompactStr, -} - -/// The expression character class. -/// E.g. 
`[a--b]`, `[a&&b]`,`[^a--b]`, `[^a&&b]` -#[derive(Debug)] -pub struct ExpressionCharacterClass<'a> { - pub span: Span, - pub negate: bool, - pub expression: ExpressionCharacterClassExpr<'a>, -} - -#[derive(Debug)] -pub enum ExpressionCharacterClassExpr<'a> { - ClassIntersection(Box<'a, ClassIntersection<'a>>), - ClassSubtraction(Box<'a, ClassSubtraction<'a>>), -} - -#[derive(Debug)] -pub enum ClassSetOperand<'a> { - Character(Box<'a, Character>), - ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>), - EscapeCharacterSet(Box<'a, EscapeCharacterSet>), - ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), - UnicodePropertyCharacterSet(Box<'a, UnicodePropertyCharacterSet<'a>>), - UnicodeSetsCharacterClass(Box<'a, UnicodeSetsCharacterClass<'a>>), -} - -/// The character class intersection. -/// E.g. `a&&b` -#[derive(Debug)] -pub struct ClassIntersection<'a> { - pub span: Span, - pub left: ClassIntersectionLeft<'a>, - pub right: ClassSetOperand<'a>, -} - -#[derive(Debug)] -pub enum ClassIntersectionLeft<'a> { - ClassIntersection(Box<'a, ClassIntersection<'a>>), - ClassSetOperand(Box<'a, ClassSetOperand<'a>>), -} - -/// The character class subtraction. -/// E.g. `a--b` -#[derive(Debug)] -pub struct ClassSubtraction<'a> { - pub span: Span, - pub left: ClassSubtractionLeft<'a>, - pub right: ClassSetOperand<'a>, -} - -#[derive(Debug)] -pub enum ClassSubtractionLeft<'a> { - ClassSetOperand(Box<'a, ClassSetOperand<'a>>), - ClassSubtraction(Box<'a, ClassSubtraction<'a>>), -} - -/// The character class string disjunction. -/// E.g. `\q{a|b}` -#[derive(Debug)] -pub struct ClassStringDisjunction<'a> { - pub span: Span, - pub alternatives: Vec<'a, StringAlternative<'a>>, -} - -/// StringAlternative is only used for `\q{alt}`({@link ClassStringDisjunction}). -#[derive(Debug)] -pub struct StringAlternative<'a> { - pub span: Span, - pub elements: Vec<'a, Character>, -} - -/// This includes escape sequences which mean a character. -/// E.g. 
`a`, `あ`, `✿`, `\x65`, `\u0065`, `\u{65}`, `\/` -#[derive(Debug)] -pub struct Character { - pub span: Span, - pub value: u16, // UTF-16 code point -} - -#[derive(Debug)] -pub enum BackreferenceRef { - Number(i32), - CompactStr(CompactStr), -} - -/// The backreference. -/// E.g. `\1`, `\k` -#[derive(Debug)] -pub struct Backreference<'a> { - pub span: Span, - pub reference: BackreferenceRef, - pub resolved: CapturingGroup<'a>, -} - -/// The flags. -#[derive(Debug)] -pub struct Flags { - pub span: Span, - pub dot_all: bool, - pub global: bool, - pub has_indices: bool, - pub ignore_case: bool, - pub multiline: bool, - pub sticky: bool, - pub unicode: bool, - pub unicode_sets: bool, -} diff --git a/crates/oxc_js_regex/src/lexer/mod.rs b/crates/oxc_js_regex/src/lexer/mod.rs deleted file mode 100644 index 40d3ff585686a..0000000000000 --- a/crates/oxc_js_regex/src/lexer/mod.rs +++ /dev/null @@ -1 +0,0 @@ -mod token; diff --git a/crates/oxc_js_regex/src/lexer/token.rs b/crates/oxc_js_regex/src/lexer/token.rs deleted file mode 100644 index 8b137891791fe..0000000000000 --- a/crates/oxc_js_regex/src/lexer/token.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/crates/oxc_js_regex/src/lib.rs b/crates/oxc_js_regex/src/lib.rs deleted file mode 100644 index 6647fb03be8f5..0000000000000 --- a/crates/oxc_js_regex/src/lib.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod ast; -mod lexer; -pub mod parser; -pub mod validator; -pub mod visitor; diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs deleted file mode 100644 index 8b137891791fe..0000000000000 --- a/crates/oxc_js_regex/src/parser.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/crates/oxc_js_regex/src/validator.rs b/crates/oxc_js_regex/src/validator.rs deleted file mode 100644 index 8b137891791fe..0000000000000 --- a/crates/oxc_js_regex/src/validator.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/crates/oxc_js_regex/src/visitor.rs b/crates/oxc_js_regex/src/visitor.rs deleted file mode 100644 index 
8b137891791fe..0000000000000 --- a/crates/oxc_js_regex/src/visitor.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/crates/oxc_regexp_parser/Cargo.toml b/crates/oxc_regexp_parser/Cargo.toml new file mode 100644 index 0000000000000..b21c9ac60b14e --- /dev/null +++ b/crates/oxc_regexp_parser/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "oxc_regexp_parser" +version = "0.0.0" +publish = false +authors.workspace = true +categories.workspace = true +description.workspace = true +edition.workspace = true +homepage.workspace = true +keywords.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +include = ["/examples", "/src"] + +[lints] +workspace = true + +[lib] +doctest = false + +[dependencies] +oxc_allocator = { workspace = true } +oxc_diagnostics = { workspace = true } +oxc_span = { workspace = true } + +phf = { workspace = true, features = ["macros"] } +rustc-hash = { workspace = true } +unicode-id-start = { workspace = true } diff --git a/crates/oxc_regexp_parser/README.md b/crates/oxc_regexp_parser/README.md new file mode 100644 index 0000000000000..974bd3612620f --- /dev/null +++ b/crates/oxc_regexp_parser/README.md @@ -0,0 +1,8 @@ +# oxc_regexp_parser + +Implements ECMAScript® 2024 Language Specification + +- https://tc39.es/ecma262/2024/multipage/ecmascript-language-lexical-grammar.html#sec-literals-regular-expression-literals +- https://tc39.es/ecma262/2024/multipage/text-processing.html#sec-regexp-regular-expression-objects +- https://tc39.es/ecma262/2024/multipage/additional-ecmascript-features-for-web-browsers.html#sec-regular-expressions-patterns + diff --git a/crates/oxc_regexp_parser/examples/parse_literal.rs b/crates/oxc_regexp_parser/examples/parse_literal.rs new file mode 100644 index 0000000000000..7d76d87be4f6f --- /dev/null +++ b/crates/oxc_regexp_parser/examples/parse_literal.rs @@ -0,0 +1,63 @@ +#![allow(clippy::print_stdout)] + +use oxc_allocator::Allocator; +use oxc_regexp_parser::{ast, Parser, 
ParserOptions}; + +fn main() { + let allocator = Allocator::default(); + + for source_text in [ + "/ab/", + "/abc/i", + "/abcd/igv", + "/emo👈🏻ji/u", + "/ab|c/i", + "/a|b+|c/i", + "/a{0}|b{1,2}|c{3,}/i", + "/(?=a)|(?<=b)|(?!c)|(?x\1c/u", + r"/(cg)(?cg)(?:g)/", + r"/{3}/", // Error + r"/Em🥹j/", + r"/^(?=ab)\b(?!cd)(?<=ef)\B(?)(?)/", // Error + r"/(?noname)/v", // Error + r"/[\bb]/", + ] { + println!("Parse: {source_text}"); + let parser = Parser::new(&allocator, source_text, ParserOptions::default()); + let ret = parser.parse(); + + match ret { + Ok(ast::RegExpLiteral { pattern, flags, .. }) => { + println!("✨ {}", pattern.span.source_text(source_text)); + println!("{pattern:#?}"); + println!("✨ {}", flags.span.source_text(source_text)); + println!("{flags:?}"); + } + Err(error) => { + let error = error.with_source_code(source_text); + println!("💥 {error:?}"); + } + } + println!(); + } +} diff --git a/crates/oxc_regexp_parser/src/ast.rs b/crates/oxc_regexp_parser/src/ast.rs new file mode 100644 index 0000000000000..5dabd1cff0533 --- /dev/null +++ b/crates/oxc_regexp_parser/src/ast.rs @@ -0,0 +1,262 @@ +use oxc_allocator::{Box, Vec}; +use oxc_span::{Atom as SpanAtom, Span}; + +#[derive(Debug)] +pub struct RegExpLiteral<'a> { + pub span: Span, + pub pattern: Pattern<'a>, + pub flags: Flags, +} + +#[derive(Debug)] +pub struct Flags { + pub span: Span, + pub global: bool, + pub ignore_case: bool, + pub multiline: bool, + pub unicode: bool, + pub sticky: bool, + pub dot_all: bool, + pub has_indices: bool, + pub unicode_sets: bool, +} + +/// The root of the `PatternParser` result. +#[derive(Debug)] +pub struct Pattern<'a> { + pub span: Span, + pub body: Disjunction<'a>, +} + +/// Pile of [`Alternative`]s separated by `|`. +#[derive(Debug)] +pub struct Disjunction<'a> { + pub span: Span, + pub body: Vec<'a, Alternative<'a>>, +} + +/// Single unit of `|` separated alternatives. 
+#[derive(Debug)] +pub struct Alternative<'a> { + pub span: Span, + pub body: Vec<'a, Term<'a>>, +} + +/// Single unit of [`Alternative`], containing various kinds. +#[derive(Debug)] +pub enum Term<'a> { + // Assertion, QuantifiableAssertion + BoundaryAssertion(BoundaryAssertion), + LookAroundAssertion(Box<'a, LookAroundAssertion<'a>>), + // Quantifier + Quantifier(Box<'a, Quantifier<'a>>), + // Atom, ExtendedAtom + Character(Character), + Dot(Dot), + CharacterClassEscape(CharacterClassEscape), + UnicodePropertyEscape(Box<'a, UnicodePropertyEscape<'a>>), + CharacterClass(Box<'a, CharacterClass<'a>>), + CapturingGroup(Box<'a, CapturingGroup<'a>>), + IgnoreGroup(Box<'a, IgnoreGroup<'a>>), + IndexedReference(IndexedReference), + NamedReference(Box<'a, NamedReference<'a>>), +} + +/// Simple form of assertion. +/// e.g. `^`, `$`, `\b`, `\B` +#[derive(Debug)] +pub struct BoundaryAssertion { + pub span: Span, + pub kind: BoundaryAssertionKind, +} +#[derive(Debug)] +pub enum BoundaryAssertionKind { + Start, + End, + Boundary, + NegativeBoundary, +} + +/// Lookaround assertion. +/// e.g. `(?=...)`, `(?!...)`, `(?<=...)`, `(? { + pub span: Span, + pub kind: LookAroundAssertionKind, + pub body: Disjunction<'a>, +} +#[derive(Debug)] +pub enum LookAroundAssertionKind { + Lookahead, + NegativeLookahead, + Lookbehind, + NegativeLookbehind, +} + +/// Quantifier holding a [`Term`] and its repetition count. +/// e.g. `a*`, `b+`, `c?`, `d{3}`, `e{4,}`, `f{5,6}` +#[derive(Debug)] +pub struct Quantifier<'a> { + pub span: Span, + pub min: u32, + /// `None` means no upper bound. + pub max: Option, + pub greedy: bool, + pub body: Term<'a>, +} + +/// Single character. +#[derive(Debug, Copy, Clone)] +pub struct Character { + /// This will be invalid position when `UnicodeMode` is disabled and `value` is a surrogate pair. + pub span: Span, + pub kind: CharacterKind, + /// Unicode code point or UTF-16 code unit. 
+ pub value: u32, +} + +#[derive(Debug, Copy, Clone)] +pub enum CharacterKind { + ControlLetter, + HexadecimalEscape, + Identifier, + Null, + Octal, + SingleEscape, + Symbol, + UnicodeEscape, +} + +/// Character class. +/// e.g. `\d`, `\D`, `\s`, `\S`, `\w`, `\W` +#[derive(Debug)] +pub struct CharacterClassEscape { + pub span: Span, + pub kind: CharacterClassEscapeKind, +} + +#[derive(Debug)] +pub enum CharacterClassEscapeKind { + D, + NegativeD, + S, + NegativeS, + W, + NegativeW, +} + +/// Unicode property. +/// e.g. `\p{ASCII}`, `\P{ASCII}`, `\p{sc=Hiragana}`, `\P{sc=Hiragana}` +#[derive(Debug)] +pub struct UnicodePropertyEscape<'a> { + pub span: Span, + pub negative: bool, + /// `true` if `UnicodeSetsMode` and `name` matched unicode property of strings. + pub strings: bool, + pub name: SpanAtom<'a>, + pub value: Option>, +} + +/// The `.`. +#[derive(Debug)] +pub struct Dot { + pub span: Span, +} + +/// Character class wrapped by `[]`. +/// e.g. `[a-z]`, `[^A-Z]`, `[abc]`, `[a&&b&&c]`, `[[a-z]--x--y]` +#[derive(Debug)] +pub struct CharacterClass<'a> { + pub span: Span, + pub negative: bool, + pub kind: CharacterClassContentsKind, + pub body: Vec<'a, CharacterClassContents<'a>>, +} + +#[derive(Debug)] +pub enum CharacterClassContentsKind { + Union, + /// `UnicodeSetsMode` only. + Intersection, + /// `UnicodeSetsMode` only. + Subtraction, +} + +#[derive(Debug)] +pub enum CharacterClassContents<'a> { + CharacterClassRange(Box<'a, CharacterClassRange>), + CharacterClassEscape(CharacterClassEscape), + UnicodePropertyEscape(Box<'a, UnicodePropertyEscape<'a>>), + Character(Character), + /// `UnicodeSetsMode` only + NestedCharacterClass(Box<'a, CharacterClass<'a>>), + /// `UnicodeSetsMode` only + ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>), +} + +/// `-` separated range of characters. +/// e.g. 
`a-z`, `A-Z`, `0-9` +#[derive(Debug)] +pub struct CharacterClassRange { + pub span: Span, + pub min: Character, + pub max: Character, +} + +/// `|` separated string of characters wrapped by `\q{}`. +#[derive(Debug)] +pub struct ClassStringDisjunction<'a> { + pub span: Span, + /// `true` if body is empty or contains 2 or more characters. + pub strings: bool, + pub body: Vec<'a, ClassString<'a>>, +} + +/// Single unit of [`ClassStringDisjunction`]. +#[derive(Debug)] +pub struct ClassString<'a> { + pub span: Span, + pub body: Vec<'a, Character>, +} + +/// Named or unnamed capturing group. +/// e.g. `(...)`, `(?...)` +#[derive(Debug)] +pub struct CapturingGroup<'a> { + pub span: Span, + pub name: Option>, + pub body: Disjunction<'a>, +} + +/// Pseudo-group for ignoring. +/// e.g. `(?:...)` +#[derive(Debug)] +pub struct IgnoreGroup<'a> { + pub span: Span, + pub enabling_modifiers: Option, + pub disabling_modifiers: Option, + pub body: Disjunction<'a>, +} + +#[derive(Debug)] +pub struct ModifierFlags { + pub ignore_case: bool, + pub sticky: bool, + pub multiline: bool, +} + +/// Backreference by index. +/// e.g. `\1`, `\2`, `\3` +#[derive(Debug)] +pub struct IndexedReference { + pub span: Span, + pub index: u32, +} + +/// Backreference by name. +/// e.g. 
`\k` +#[derive(Debug)] +pub struct NamedReference<'a> { + pub span: Span, + pub name: SpanAtom<'a>, +} diff --git a/crates/oxc_regexp_parser/src/body_parser/mod.rs b/crates/oxc_regexp_parser/src/body_parser/mod.rs new file mode 100644 index 0000000000000..43e1a1d2ae547 --- /dev/null +++ b/crates/oxc_regexp_parser/src/body_parser/mod.rs @@ -0,0 +1,251 @@ +mod parser; +mod reader; +mod state; +mod unicode; +mod unicode_property; + +pub use parser::PatternParser; + +#[cfg(test)] +mod test { + use crate::{ParserOptions, PatternParser}; + use oxc_allocator::Allocator; + + // NOTE: These may be useless when integration tests are added + #[test] + fn should_pass() { + let allocator = Allocator::default(); + + for (source_text, options) in &[ + ("", ParserOptions::default()), + ("a", ParserOptions::default()), + ("a+", ParserOptions::default()), + ("a*", ParserOptions::default()), + ("a?", ParserOptions::default()), + ("^$^$^$", ParserOptions::default()), + ("(?=a){1}", ParserOptions::default()), + ("(?!a){1}", ParserOptions::default()), + ("a{1}", ParserOptions::default()), + ("a{1", ParserOptions::default()), + ("a|{", ParserOptions::default()), + ("a{", ParserOptions::default()), + ("a{,", ParserOptions::default()), + ("a{1,", ParserOptions::default()), + ("a{1,}", ParserOptions::default()), + ("a{1,2}", ParserOptions::default()), + ("a|b", ParserOptions::default()), + ("a|b|c", ParserOptions::default()), + ("a|b+?|c", ParserOptions::default()), + ("a+b*?c{1}d{2,}e{3,4}?", ParserOptions::default()), + (r"^(?=ab)\b(?!cd)(?<=ef)\B(?.)\x1f", ParserOptions::default()), + ("a]", ParserOptions::default()), + ("a}", ParserOptions::default()), + ("]", ParserOptions::default()), + ("[]", ParserOptions::default()), + ("[a]", ParserOptions::default()), + ("[ab]", ParserOptions::default()), + ("[a-b]", ParserOptions::default()), + ("[-]", ParserOptions::default()), + ("[a-]", ParserOptions::default()), + ("[-a]", ParserOptions::default()), + ("[-a-]", ParserOptions::default()), + 
(r"[a\-b]", ParserOptions::default()), + (r"[-a-b]", ParserOptions::default()), + (r"[a-b-]", ParserOptions::default()), + (r"[a\-b-]", ParserOptions::default()), + (r"[\[\]\-]", ParserOptions::default()), + ("[a-z0-9]", ParserOptions::default()), + ("[a-a]", ParserOptions::default()), + (r"[\d-\D]", ParserOptions::default()), + (r"^([\ud801[\udc28-\udc4f])$", ParserOptions::default()), + (r"[a-c]]", ParserOptions::default()), + ( + r"[ϗϙϛϝϟϡϣϥϧϩϫϭϯ-ϳϵϸϻ-ϼа-џѡѣѥѧѩѫѭѯѱѳѵѷѹѻѽѿҁҋҍҏґғҕҗҙқҝҟҡңҥҧҩҫҭүұҳҵҷҹһҽҿӂӄӆӈӊӌӎ-ӏӑӓӕӗәӛӝӟӡӣӥӧөӫӭӯӱӳӵӷӹӻӽӿԁԃԅԇԉԋԍԏԑԓԕԗԙԛԝԟԡԣա-ևᴀ-ᴫᵢ-ᵷᵹ-ᶚḁḃḅḇḉḋḍḏḑḓḕḗḙḛḝḟḡḣḥḧḩḫḭḯḱḳḵḷḹḻḽḿṁṃṅṇṉṋṍṏṑṓṕṗṙṛṝṟṡṣṥṧṩṫṭṯṱṳṵṷṹṻṽṿẁẃẅẇẉẋẍẏẑẓẕ-ẝẟạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹỻỽỿ-ἇἐ-ἕἠ-ἧἰ-ἷὀ-ὅὐ-ὗὠ-ὧὰ]", + ParserOptions::default(), + ), + (r"[a-z0-9[.\\]]", ParserOptions::default().with_unicode_sets_mode()), + (r"[a&&b&&c]", ParserOptions::default().with_unicode_sets_mode()), + (r"[a--b--c]", ParserOptions::default().with_unicode_sets_mode()), + (r"[[a-z]--b--c]", ParserOptions::default().with_unicode_sets_mode()), + ( + r"[[[[[[[[[[[[[[[[[[[[[[[[a]]]]]]]]]]]]]]]]]]]]]]]]", + ParserOptions::default().with_unicode_sets_mode(), + ), + ( + r"[\q{}\q{a}\q{bc}\q{d|e|f}\q{|||}]", + ParserOptions::default().with_unicode_sets_mode(), + ), + (r"(?A)\k", ParserOptions::default()), + (r"(?)\k", ParserOptions::default()), + (r"\k", ParserOptions::default()), + (r"\k<4>", ParserOptions::default()), + (r"\k", ParserOptions::default()), + (r"(?)\k", ParserOptions::default()), + (r"(?)\k", ParserOptions::default().with_unicode_mode()), + (r"\1", ParserOptions::default()), + (r"\1()", ParserOptions::default()), + (r"\1()", ParserOptions::default().with_unicode_mode()), + (r"(?..)(?..)", ParserOptions::default()), + // TODO: ES2025 Duplicate named capturing groups + // (r"(?..)|(?..)", ParserOptions::default()), + // (r"(?[0-9]{4})-[0-9]{2}|[0-9]{2}-(?[0-9]{4})", ParserOptions::default()), + // (r"(?:(?x)|(?y))\k", ParserOptions::default()), + ] { + let res = 
PatternParser::new(&allocator, source_text, *options).parse(); + if let Err(err) = res { + panic!("Failed to parse {source_text} with {options:?}\n💥 {err}"); + } + } + } + + #[test] + fn should_fail() { + let allocator = Allocator::default(); + + for (source_text, options) in &[ + ("a)", ParserOptions::default()), + (r"a\", ParserOptions::default()), + ("a]", ParserOptions::default().with_unicode_mode()), + ("a}", ParserOptions::default().with_unicode_mode()), + ("a|+", ParserOptions::default()), + ("a|{", ParserOptions::default().with_unicode_mode()), + ("a{", ParserOptions::default().with_unicode_mode()), + ("a{1", ParserOptions::default().with_unicode_mode()), + ("a{1,", ParserOptions::default().with_unicode_mode()), + ("a{,", ParserOptions::default().with_unicode_mode()), + ("(?=a", ParserOptions::default()), + ("(?", ParserOptions::default().with_unicode_mode()), + (r"\k<4>", ParserOptions::default().with_unicode_mode()), + (r"\k", ParserOptions::default().with_unicode_mode()), + ("a(?:", ParserOptions::default()), + ("(a", ParserOptions::default()), + ("(?", ParserOptions::default()), + ("(?)", ParserOptions::default()), + ("(?=a){1}", ParserOptions::default().with_unicode_mode()), + ("(?!a){1}", ParserOptions::default().with_unicode_mode()), + (r"[\d-\D]", ParserOptions::default().with_unicode_mode()), + ("[z-a]", ParserOptions::default()), + (r"[a-c]]", ParserOptions::default().with_unicode_mode()), + ( + 
r"^([a-zªµºß-öø-ÿāăąćĉċčďđēĕėęěĝğġģĥħĩīĭįıijĵķ-ĸĺļľŀłńņň-ʼnŋōŏőœŕŗřśŝşšţťŧũūŭůűųŵŷźżž-ƀƃƅƈƌ-ƍƒƕƙ-ƛƞơƣƥƨƪ-ƫƭưƴƶƹ-ƺƽ-ƿdžljnjǎǐǒǔǖǘǚǜ-ǝǟǡǣǥǧǩǫǭǯ-ǰdzǵǹǻǽǿȁȃȅȇȉȋȍȏȑȓȕȗșțȝȟȡȣȥȧȩȫȭȯȱȳ-ȹȼȿ-ɀɂɇɉɋɍɏ-ʓʕ-ʯͱͳͷͻ-ͽΐά-ώϐ-ϑϕ-ϗϙϛϝϟϡϣϥϧϩϫϭϯ-ϳϵϸϻ-ϼа-џѡѣѥѧѩѫѭѯѱѳѵѷѹѻѽѿҁҋҍҏґғҕҗҙқҝҟҡңҥҧҩҫҭүұҳҵҷҹһҽҿӂӄӆӈӊӌӎ-ӏӑӓӕӗәӛӝӟӡӣӥӧөӫӭӯӱӳӵӷӹӻӽӿԁԃԅԇԉԋԍԏԑԓԕԗԙԛԝԟԡԣա-ևᴀ-ᴫᵢ-ᵷᵹ-ᶚḁḃḅḇḉḋḍḏḑḓḕḗḙḛḝḟḡḣḥḧḩḫḭḯḱḳḵḷḹḻḽḿṁṃṅṇṉṋṍṏṑṓṕṗṙṛṝṟṡṣṥṧṩṫṭṯṱṳṵṷṹṻṽṿẁẃẅẇẉẋẍẏẑẓẕ-ẝẟạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹỻỽỿ-ἇἐ-ἕἠ-ἧἰ-ἷὀ-ὅὐ-ὗὠ-ὧὰ-ώᾀ-ᾇᾐ-ᾗᾠ-ᾧᾰ-ᾴᾶ-ᾷιῂ-ῄῆ-ῇῐ-ΐῖ-ῗῠ-ῧῲ-ῴῶ-ῷⁱⁿℊℎ-ℏℓℯℴℹℼ-ℽⅆ-ⅉⅎↄⰰ-ⱞⱡⱥ-ⱦⱨⱪⱬⱱⱳ-ⱴⱶ-ⱼⲁⲃⲅⲇⲉⲋⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱⲳⲵⲷⲹⲻⲽⲿⳁⳃⳅⳇⳉⳋⳍⳏⳑⳓⳕⳗⳙⳛⳝⳟⳡⳣ-ⳤⴀ-ⴥꙁꙃꙅꙇꙉꙋꙍꙏꙑꙓꙕꙗꙙꙛꙝꙟꙣꙥꙧꙩꙫꙭꚁꚃꚅꚇꚉꚋꚍꚏꚑꚓꚕꚗꜣꜥꜧꜩꜫꜭꜯ-ꜱꜳꜵꜷꜹꜻꜽꜿꝁꝃꝅꝇꝉꝋꝍꝏꝑꝓꝕꝗꝙꝛꝝꝟꝡꝣꝥꝧꝩꝫꝭꝯꝱ-ꝸꝺꝼꝿꞁꞃꞅꞇꞌff-stﬓ-ﬗa-z]|\ud801[\udc28-\udc4f]|\ud835[\udc1a-\udc33\udc4e-\udc54\udc56-\udc67\udc82-\udc9b\udcb6-\udcb9\udcbb\udcbd-\udcc3\udcc5-\udccf\udcea-\udd03\udd1e-\udd37\udd52-\udd6b\udd86-\udd9f\uddba-\uddd3\uddee-\ude07\ude22-\ude3b\ude56-\ude6f\ude8a-\udea5\udec2-\udeda\udedc-\udee1\udefc-\udf14\udf16-\udf1b\udf36-\udf4e\udf50-\udf55\udf70-\udf88\udf8a-\udf8f\udfaa-\udfc2\udfc4-\udfc9\udfcb])$", + ParserOptions::default(), + ), + (r"[[\d-\D]]", ParserOptions::default().with_unicode_sets_mode()), + (r"[a&&b--c]", ParserOptions::default().with_unicode_sets_mode()), + (r"[a--b&&c]", ParserOptions::default().with_unicode_sets_mode()), + (r"[\q{]", ParserOptions::default().with_unicode_sets_mode()), + (r"[\q{\a}]", ParserOptions::default().with_unicode_sets_mode()), + // TODO: ES2025 Duplicate named capturing groups + (r"(?..)|(?..)", ParserOptions::default()), // This will be valid + // (r"(?|(?))", ParserOptions::default()), // Nested, still invalid + ] { + assert!( + PatternParser::new(&allocator, source_text, *options).parse().is_err(), + "{source_text} should fail to parse with {options:?}!" 
+ ); + } + } + + #[test] + fn should_fail_early_errors() { + let allocator = Allocator::default(); + + for (source_text, options, is_err) in &[ + (r"(?..)(?..)", ParserOptions::default(), true), + (r"a{2,1}", ParserOptions::default(), true), + (r"(?)\k", ParserOptions::default(), true), + (r"()\2", ParserOptions::default().with_unicode_mode(), true), + (r"[a-\d]", ParserOptions::default().with_unicode_mode(), true), + (r"[\d-z]", ParserOptions::default().with_unicode_mode(), true), + (r"[\d-\d]", ParserOptions::default().with_unicode_mode(), true), + (r"[z-a]", ParserOptions::default(), true), + (r"\u{110000}", ParserOptions::default().with_unicode_mode(), true), + (r"(?<\uD800\uDBFF>)", ParserOptions::default(), true), + (r"\u{0}\u{110000}", ParserOptions::default().with_unicode_mode(), true), + (r"(?)", ParserOptions::default(), true), + (r"\p{Foo=Bar}", ParserOptions::default().with_unicode_mode(), true), + (r"\p{Foo}", ParserOptions::default().with_unicode_mode(), true), + (r"\p{Basic_Emoji}", ParserOptions::default().with_unicode_mode(), true), + (r"\P{Basic_Emoji}", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[^\p{Basic_Emoji}]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\p{Basic_Emoji}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{a|}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{ng}\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{o|k}\q{ng}\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{o|k}\q{o|k}\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{}&&\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{ng}&&\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), false), + ( + 
r"[[^\q{ng}&&\q{o|k}&&\q{ng}]]", + ParserOptions::default().with_unicode_sets_mode(), + false, + ), + (r"[[^\q{ng}--\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[^\q{o|k}--\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), false), + (r"[[z-a]]", ParserOptions::default().with_unicode_sets_mode(), true), + ] { + assert_eq!( + PatternParser::new(&allocator, source_text, *options).parse().is_err(), + *is_err, + "{source_text} should early error with {options:?}!" + ); + } + } + + #[test] + fn should_handle_empty() { + let allocator = Allocator::default(); + let pattern = PatternParser::new(&allocator, "", ParserOptions::default()).parse().unwrap(); + + assert_eq!(pattern.body.body[0].body.len(), 1); + } + + #[test] + fn should_handle_unicode() { + let allocator = Allocator::default(); + let source_text = "このEmoji🥹の数が変わる"; + + for (options, expected) in &[ + (ParserOptions::default(), 15), + (ParserOptions::default().with_unicode_mode(), 14), + (ParserOptions::default().with_unicode_sets_mode(), 14), + ] { + let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap(); + assert_eq!(pattern.body.body[0].body.len(), *expected); + } + } +} diff --git a/crates/oxc_regexp_parser/src/body_parser/parser.rs b/crates/oxc_regexp_parser/src/body_parser/parser.rs new file mode 100644 index 0000000000000..4ab0307f7db34 --- /dev/null +++ b/crates/oxc_regexp_parser/src/body_parser/parser.rs @@ -0,0 +1,2092 @@ +use oxc_allocator::{Allocator, Box, Vec}; +use oxc_diagnostics::{OxcDiagnostic, Result}; +use oxc_span::Atom as SpanAtom; + +use crate::{ + ast, + body_parser::{reader::Reader, state::State, unicode, unicode_property}, + options::ParserOptions, + span::SpanFactory, +}; + +pub struct PatternParser<'a> { + allocator: &'a Allocator, + source_text: &'a str, + span_factory: SpanFactory, + reader: Reader<'a>, + state: State<'a>, +} + +impl<'a> PatternParser<'a> { + pub fn new(allocator: &'a Allocator, source_text: &'a 
str, options: ParserOptions) -> Self {
+        // `RegExp` can not be empty.
+        // - Literal `//` means just a single line comment
+        // - For `new RegExp("")` or `new RegExp()` (= empty), use a placeholder
+        let source_text = if source_text.is_empty() { "(?:)" } else { source_text };
+
+        Self {
+            allocator,
+            source_text,
+            span_factory: SpanFactory::new(options.span_offset),
+            reader: Reader::new(source_text, options.unicode_mode),
+            state: State::new(options.unicode_mode, options.unicode_sets_mode),
+        }
+    }
+
+    /// Parses the whole source text into an [`ast::Pattern`].
+    ///
+    /// # Errors
+    ///
+    /// Returns a diagnostic when an early error is detected (too many capturing groups,
+    /// duplicated group names), when a nested production fails, or when input is left over
+    /// after the top-level `Disjunction`.
+    pub fn parse(&mut self) -> Result<ast::Pattern<'a>> {
+        // Pre parse whole pattern to collect:
+        // - the number of (named|unnamed) capturing groups
+        //   - For `\1` in `\1()` to be handled as indexed reference
+        // - names of named capturing groups
+        //   - For `\k<name>` references to be checked against the collected names
+        //     (early error in `+NamedCaptureGroups`)
+        //
+        // NOTE: It means that this performs 2 passes in every case.
+        // - Pros: Code is simple enough and easy to understand
+        // - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups
+        // We may re-consider this if we need more performance rather than simplicity.
+        self.state.initialize_with_parsing(self.source_text);
+
+        // [SS:EE] Pattern :: Disjunction
+        // It is a Syntax Error if CountLeftCapturingParensWithin(Pattern) ≥ 2**32 - 1.
+        //
+        // NOTE: `2 ^ 32` in Rust is bitwise XOR (= 34), not exponentiation, so it must not be
+        // used to spell the limit. `2**32 - 1` is exactly `u32::MAX`, and the counter is a `u32`,
+        // so "≥ 2**32 - 1" can only hold when the counter equals `u32::MAX`.
+        if self.state.num_of_capturing_groups == u32::MAX {
+            return Err(OxcDiagnostic::error("Too many capturing groups"));
+        }
+        // [SS:EE] Pattern :: Disjunction
+        // It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
+ if self.state.num_of_named_capturing_groups as usize != self.state.found_group_names.len() { + return Err(OxcDiagnostic::error("Duplicated group name")); + } + + let disjunction = self.parse_disjunction()?; + + if self.reader.peek().is_some() { + let span_start = self.reader.offset(); + return Err(OxcDiagnostic::error("Could not parse the entire pattern") + .with_label(self.span_factory.create(span_start, span_start))); + } + + Ok(ast::Pattern { + span: self.span_factory.create(0, self.source_text.len()), + body: disjunction, + }) + } + + // ``` + // Disjunction[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + // Alternative[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + // Alternative[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] | Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + // ``` + fn parse_disjunction(&mut self) -> Result> { + let span_start = self.reader.offset(); + + let mut body = Vec::new_in(self.allocator); + loop { + body.push(self.parse_alternative()?); + + if !self.reader.eat('|') { + break; + } + } + + Ok(ast::Disjunction { + span: self.span_factory.create(span_start, self.reader.offset()), + body, + }) + } + + // ``` + // Alternative[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + // [empty] + // Alternative[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] Term[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + // ``` + fn parse_alternative(&mut self) -> Result> { + let span_start = self.reader.offset(); + + let mut body = Vec::new_in(self.allocator); + while let Some(term) = self.parse_term()? 
{ + body.push(term); + } + + Ok(ast::Alternative { + span: self.span_factory.create(span_start, self.reader.offset()), + body, + }) + } + + // ``` + // Term[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + // [+UnicodeMode] Assertion[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + // [+UnicodeMode] Atom[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] Quantifier + // [+UnicodeMode] Atom[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + // [~UnicodeMode] QuantifiableAssertion[?NamedCaptureGroups] Quantifier + // [~UnicodeMode] Assertion[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] + // [~UnicodeMode] ExtendedAtom[?NamedCaptureGroups] Quantifier + // [~UnicodeMode] ExtendedAtom[?NamedCaptureGroups] + // ``` + // (Annex B) + fn parse_term(&mut self) -> Result>> { + // [+UnicodeMode] Assertion + // [+UnicodeMode] Atom Quantifier + // [+UnicodeMode] Atom + if self.state.unicode_mode { + if let Some(assertion) = self.parse_assertion()? { + return Ok(Some(assertion)); + } + + let span_start = self.reader.offset(); + return match (self.parse_atom()?, self.consume_quantifier()?) { + (Some(atom), Some(((min, max), greedy))) => { + Ok(Some(ast::Term::Quantifier(Box::new_in( + ast::Quantifier { + span: self.span_factory.create(span_start, self.reader.offset()), + greedy, + min, + max, + body: atom, + }, + self.allocator, + )))) + } + (Some(atom), None) => Ok(Some(atom)), + (None, Some(_)) => { + Err(OxcDiagnostic::error("Lone `Quantifier` found, expected with `Atom`") + .with_label(self.span_factory.create(span_start, self.reader.offset()))) + } + (None, None) => Ok(None), + }; + } + + // [~UnicodeMode] QuantifiableAssertion Quantifier + // [~UnicodeMode] Assertion + // [~UnicodeMode] ExtendedAtom Quantifier + // [~UnicodeMode] ExtendedAtom + let span_start = self.reader.offset(); + if let Some(assertion) = self.parse_assertion()? 
{ + // `QuantifiableAssertion` = (Negative)Lookahead: `(?=...)` or `(?!...)` + if let ast::Term::LookAroundAssertion(look_around) = &assertion { + if matches!( + look_around.kind, + ast::LookAroundAssertionKind::Lookahead + | ast::LookAroundAssertionKind::NegativeLookahead + ) { + if let Some(((min, max), greedy)) = self.consume_quantifier()? { + return Ok(Some(ast::Term::Quantifier(Box::new_in( + ast::Quantifier { + span: self.span_factory.create(span_start, self.reader.offset()), + greedy, + min, + max, + body: assertion, + }, + self.allocator, + )))); + } + } + } + + return Ok(Some(assertion)); + } + + match (self.parse_extended_atom()?, self.consume_quantifier()?) { + (Some(extended_atom), Some(((min, max), greedy))) => { + Ok(Some(ast::Term::Quantifier(Box::new_in( + ast::Quantifier { + span: self.span_factory.create(span_start, self.reader.offset()), + min, + max, + greedy, + body: extended_atom, + }, + self.allocator, + )))) + } + (Some(extended_atom), None) => Ok(Some(extended_atom)), + (None, Some(_)) => { + Err(OxcDiagnostic::error("Lone `Quantifier` found, expected with `ExtendedAtom`") + .with_label(self.span_factory.create(span_start, self.reader.offset()))) + } + (None, None) => Ok(None), + } + } + + // ``` + // Assertion[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + // ^ + // $ + // \b + // \B + // [+UnicodeMode] (?= Disjunction[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + // [+UnicodeMode] (?! Disjunction[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + // [~UnicodeMode] QuantifiableAssertion[?NamedCaptureGroups] + // (?<= Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + // (? 
Result>> { + let span_start = self.reader.offset(); + + let kind = if self.reader.eat('^') { + Some(ast::BoundaryAssertionKind::Start) + } else if self.reader.eat('$') { + Some(ast::BoundaryAssertionKind::End) + } else if self.reader.eat2('\\', 'b') { + Some(ast::BoundaryAssertionKind::Boundary) + } else if self.reader.eat2('\\', 'B') { + Some(ast::BoundaryAssertionKind::NegativeBoundary) + } else { + None + }; + + if let Some(kind) = kind { + return Ok(Some(ast::Term::BoundaryAssertion(ast::BoundaryAssertion { + span: self.span_factory.create(span_start, self.reader.offset()), + kind, + }))); + } + + let kind = if self.reader.eat3('(', '?', '=') { + Some(ast::LookAroundAssertionKind::Lookahead) + } else if self.reader.eat3('(', '?', '!') { + Some(ast::LookAroundAssertionKind::NegativeLookahead) + } else if self.reader.eat4('(', '?', '<', '=') { + Some(ast::LookAroundAssertionKind::Lookbehind) + } else if self.reader.eat4('(', '?', '<', '!') { + Some(ast::LookAroundAssertionKind::NegativeLookbehind) + } else { + None + }; + + if let Some(kind) = kind { + let disjunction = self.parse_disjunction()?; + + if !self.reader.eat(')') { + return Err(OxcDiagnostic::error("Unterminated lookaround assertion") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + return Ok(Some(ast::Term::LookAroundAssertion(Box::new_in( + ast::LookAroundAssertion { + span: self.span_factory.create(span_start, self.reader.offset()), + kind, + body: disjunction, + }, + self.allocator, + )))); + } + + Ok(None) + } + + // ``` + // Atom[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + // PatternCharacter + // . 
+ // \ AtomEscape[?UnicodeMode, ?NamedCaptureGroups] + // CharacterClass[?UnicodeMode, ?UnicodeSetsMode] + // ( GroupSpecifier[?UnicodeMode][opt] Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + // (?: Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + // ``` + fn parse_atom(&mut self) -> Result>> { + let span_start = self.reader.offset(); + + // PatternCharacter + if let Some(cp) = self.reader.peek().filter(|&cp| !unicode::is_syntax_character(cp)) { + self.reader.advance(); + + return Ok(Some(ast::Term::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: cp, + }))); + } + + // . + if self.reader.eat('.') { + return Ok(Some(ast::Term::Dot(ast::Dot { + span: self.span_factory.create(span_start, self.reader.offset()), + }))); + } + + // \ AtomEscape[?UnicodeMode, ?NamedCaptureGroups] + if self.reader.eat('\\') { + if let Some(atom_escape) = self.parse_atom_escape(span_start)? { + return Ok(Some(atom_escape)); + } + } + + // CharacterClass[?UnicodeMode, ?UnicodeSetsMode] + if let Some(character_class) = self.parse_character_class()? { + return Ok(Some(ast::Term::CharacterClass(Box::new_in( + character_class, + self.allocator, + )))); + } + + // (?: Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + if let Some(ignore_group) = self.parse_ignore_group()? { + return Ok(Some(ast::Term::IgnoreGroup(Box::new_in(ignore_group, self.allocator)))); + } + + // ( GroupSpecifier[?UnicodeMode][opt] Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + // ( Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + if let Some(capturing_group) = self.parse_capturing_group()? { + return Ok(Some(ast::Term::CapturingGroup(Box::new_in( + capturing_group, + self.allocator, + )))); + } + + Ok(None) + } + + // ``` + // ExtendedAtom[NamedCaptureGroups] :: + // . 
+ // \ AtomEscape[~UnicodeMode, ?NamedCaptureGroups] + // \ [lookahead = c] + // CharacterClass[~UnicodeMode, ~UnicodeSetsMode] + // ( GroupSpecifier[~UnicodeMode][opt] Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] ) + // (?: Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] ) + // InvalidBracedQuantifier + // ExtendedPatternCharacter + // ``` + fn parse_extended_atom(&mut self) -> Result>> { + let span_start = self.reader.offset(); + + // . + if self.reader.eat('.') { + return Ok(Some(ast::Term::Dot(ast::Dot { + span: self.span_factory.create(span_start, self.reader.offset()), + }))); + } + + if self.reader.eat('\\') { + // \ AtomEscape[~UnicodeMode, ?NamedCaptureGroups] + if let Some(atom_escape) = self.parse_atom_escape(span_start)? { + return Ok(Some(atom_escape)); + } + + // \ [lookahead = c] + if self.reader.peek().filter(|&cp| cp == 'c' as u32).is_some() { + return Ok(Some(ast::Term::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '\\' as u32, + }))); + } + + return Err(OxcDiagnostic::error("Invalid escape") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + // CharacterClass[~UnicodeMode, ~UnicodeSetsMode] + if let Some(character_class) = self.parse_character_class()? { + return Ok(Some(ast::Term::CharacterClass(Box::new_in( + character_class, + self.allocator, + )))); + } + + // (?: Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] ) + if let Some(ignore_group) = self.parse_ignore_group()? { + return Ok(Some(ast::Term::IgnoreGroup(Box::new_in(ignore_group, self.allocator)))); + } + + // ( GroupSpecifier[~UnicodeMode][opt] Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] ) + // ( Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] ) + if let Some(capturing_group) = self.parse_capturing_group()? 
{ + return Ok(Some(ast::Term::CapturingGroup(Box::new_in( + capturing_group, + self.allocator, + )))); + } + + // InvalidBracedQuantifier + let span_start = self.reader.offset(); + if self.consume_quantifier()?.is_some() { + // [SS:EE] ExtendedAtom :: InvalidBracedQuantifier + // It is a Syntax Error if any source text is matched by this production. + // (Annex B) + return Err(OxcDiagnostic::error("Invalid braced quantifier") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + // ExtendedPatternCharacter + if let Some(cp) = self.consume_extended_pattern_character() { + return Ok(Some(ast::Term::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: cp, + }))); + } + + Ok(None) + } + + // ``` + // AtomEscape[UnicodeMode, NamedCaptureGroups] :: + // [+UnicodeMode] DecimalEscape + // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber of DecimalEscape is ≤ CountLeftCapturingParensWithin(the Pattern containing DecimalEscape) + // CharacterClassEscape[?UnicodeMode] + // CharacterEscape[?UnicodeMode, ?NamedCaptureGroups] + // [+NamedCaptureGroups] k GroupName[?UnicodeMode] + // ``` + // (Annex B) + fn parse_atom_escape(&mut self, span_start: usize) -> Result>> { + let checkpoint = self.reader.checkpoint(); + + // DecimalEscape: \1 means indexed reference + if let Some(index) = self.consume_decimal_escape() { + if self.state.unicode_mode { + // [SS:EE] AtomEscape :: DecimalEscape + // It is a Syntax Error if the CapturingGroupNumber of DecimalEscape is strictly greater than CountLeftCapturingParensWithin(the Pattern containing AtomEscape). 
+ if self.state.num_of_capturing_groups < index { + return Err(OxcDiagnostic::error("Invalid indexed reference") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + return Ok(Some(ast::Term::IndexedReference(ast::IndexedReference { + span: self.span_factory.create(span_start, self.reader.offset()), + index, + }))); + } + + if index <= self.state.num_of_capturing_groups { + return Ok(Some(ast::Term::IndexedReference(ast::IndexedReference { + span: self.span_factory.create(span_start, self.reader.offset()), + index, + }))); + } + + self.reader.rewind(checkpoint); + } + + // CharacterClassEscape: \d, \p{...} + if let Some(character_class_escape) = self.parse_character_class_escape(span_start) { + return Ok(Some(ast::Term::CharacterClassEscape(character_class_escape))); + } + if let Some(unicode_property_escape) = + self.parse_character_class_escape_unicode(span_start)? + { + return Ok(Some(ast::Term::UnicodePropertyEscape(Box::new_in( + unicode_property_escape, + self.allocator, + )))); + } + + // CharacterEscape: \n, \cM, \0, etc... + if let Some(character_escape) = self.parse_character_escape(span_start)? { + return Ok(Some(ast::Term::Character(character_escape))); + } + + // k GroupName: \k means named reference + if self.state.named_capture_groups && self.reader.eat('k') { + if let Some(name) = self.consume_group_name()? { + // [SS:EE] AtomEscape :: k GroupName + // It is a Syntax Error if GroupSpecifiersThatMatch(GroupName) is empty. 
+ if !self.state.found_group_names.contains(name.as_str()) { + return Err(OxcDiagnostic::error("Group specifier is empty") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + return Ok(Some(ast::Term::NamedReference(Box::new_in( + ast::NamedReference { + span: self.span_factory.create(span_start, self.reader.offset()), + name, + }, + self.allocator, + )))); + } + + return Err(OxcDiagnostic::error("Invalid named reference") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + Err(OxcDiagnostic::error("Invalid atom escape") + .with_label(self.span_factory.create(span_start, self.reader.offset()))) + } + + // ``` + // CharacterClassEscape :: + // d + // D + // s + // S + // w + // W + // ``` + fn parse_character_class_escape( + &mut self, + span_start: usize, + ) -> Option { + let kind = if self.reader.eat('d') { + ast::CharacterClassEscapeKind::D + } else if self.reader.eat('D') { + ast::CharacterClassEscapeKind::NegativeD + } else if self.reader.eat('s') { + ast::CharacterClassEscapeKind::S + } else if self.reader.eat('S') { + ast::CharacterClassEscapeKind::NegativeS + } else if self.reader.eat('w') { + ast::CharacterClassEscapeKind::W + } else if self.reader.eat('W') { + ast::CharacterClassEscapeKind::NegativeW + } else { + return None; + }; + + Some(ast::CharacterClassEscape { + span: self.span_factory.create(span_start, self.reader.offset()), + kind, + }) + } + // ``` + // CharacterClassEscape[UnicodeMode] :: + // [+UnicodeMode] p{ UnicodePropertyValueExpression } + // [+UnicodeMode] P{ UnicodePropertyValueExpression } + // ``` + fn parse_character_class_escape_unicode( + &mut self, + span_start: usize, + ) -> Result>> { + if !self.state.unicode_mode { + return Ok(None); + } + + let negative = if self.reader.eat('p') { + false + } else if self.reader.eat('P') { + true + } else { + return Ok(None); + }; + + if self.reader.eat('{') { + if let Some((name, value, is_strings_related)) = + 
self.consume_unicode_property_value_expression()? + { + if self.reader.eat('}') { + // [SS:EE] CharacterClassEscape :: P{ UnicodePropertyValueExpression } + // It is a Syntax Error if MayContainStrings of the UnicodePropertyValueExpression is true. + // MayContainStrings is true + // - if the UnicodePropertyValueExpression is LoneUnicodePropertyNameOrValue + // - and it is binary property of strings(can be true only with `UnicodeSetsMode`) + if negative && is_strings_related { + return Err(OxcDiagnostic::error( + "Invalid property name(negative + property of strings)", + ) + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + return Ok(Some(ast::UnicodePropertyEscape { + span: self.span_factory.create(span_start, self.reader.offset()), + negative, + strings: is_strings_related, + name, + value, + })); + } + } + } + + Err(OxcDiagnostic::error("Unterminated unicode property escape") + .with_label(self.span_factory.create(span_start, self.reader.offset()))) + } + + // ``` + // CharacterEscape[UnicodeMode, NamedCaptureGroups] :: + // ControlEscape + // c AsciiLetter + // 0 [lookahead ∉ DecimalDigit] + // HexEscapeSequence + // RegExpUnicodeEscapeSequence[?UnicodeMode] + // [~UnicodeMode] LegacyOctalEscapeSequence + // IdentityEscape[?UnicodeMode, ?NamedCaptureGroups] + // ``` + // (Annex B) + fn parse_character_escape(&mut self, span_start: usize) -> Result> { + // e.g. \n + if let Some(cp) = self.reader.peek().and_then(unicode::map_control_escape) { + self.reader.advance(); + + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::SingleEscape, + value: cp, + })); + } + + // e.g. 
\cM + let checkpoint = self.reader.checkpoint(); + if self.reader.eat('c') { + if let Some(cp) = self.reader.peek().and_then(unicode::map_c_ascii_letter) { + self.reader.advance(); + + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::ControlLetter, + value: cp, + })); + } + self.reader.rewind(checkpoint); + } + + // e.g. \0 + if self.reader.peek().filter(|&cp| cp == '0' as u32).is_some() + && self.reader.peek2().filter(|&cp| unicode::is_decimal_digit(cp)).is_none() + { + self.reader.advance(); + + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Null, + value: 0x00, + })); + } + + // e.g. \x41 + if self.reader.eat('x') { + if let Some(cp) = self.consume_fixed_hex_digits(2) { + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::HexadecimalEscape, + value: cp, + })); + } + + return Err(OxcDiagnostic::error("Invalid hexadecimal escape") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + // e.g. \u{1f600} + if let Some(cp) = self.consume_reg_exp_unicode_escape_sequence(self.state.unicode_mode)? { + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::UnicodeEscape, + value: cp, + })); + } + + // e.g. \18 + if !self.state.unicode_mode { + if let Some(cp) = self.consume_legacy_octal_escape_sequence() { + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Octal, + value: cp, + })); + } + } + + // e.g. \. 
+ if let Some(cp) = self.consume_identity_escape() { + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Identifier, + value: cp, + })); + } + + Ok(None) + } + + // ``` + // CharacterClass[UnicodeMode, UnicodeSetsMode] :: + // [ [lookahead ≠ ^] ClassContents[?UnicodeMode, ?UnicodeSetsMode] ] + // [^ ClassContents[?UnicodeMode, ?UnicodeSetsMode] ] + // ``` + fn parse_character_class(&mut self) -> Result>> { + let span_start = self.reader.offset(); + + if self.reader.eat('[') { + let negative = self.reader.eat('^'); + let (kind, body) = self.parse_class_contents()?; + + if self.reader.eat(']') { + // [SS:EE] CharacterClass :: [^ ClassContents ] + // It is a Syntax Error if MayContainStrings of the ClassContents is true. + if negative + && body.iter().any(|item| match item { + // MayContainStrings is true + // - if ClassContents contains UnicodePropertyValueExpression + // - and the UnicodePropertyValueExpression is LoneUnicodePropertyNameOrValue + // - and it is binary property of strings(can be true only with `UnicodeSetsMode`) + ast::CharacterClassContents::UnicodePropertyEscape( + unicode_property_escape, + ) => unicode_property_escape.strings, + _ => false, + }) + { + return Err(OxcDiagnostic::error("Invalid character class") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + return Ok(Some(ast::CharacterClass { + span: self.span_factory.create(span_start, self.reader.offset()), + negative, + kind, + body, + })); + } + + return Err(OxcDiagnostic::error("Unterminated character class") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + Ok(None) + } + + // ``` + // ClassContents[UnicodeMode, UnicodeSetsMode] :: + // [empty] + // [~UnicodeSetsMode] NonemptyClassRanges[?UnicodeMode] + // [+UnicodeSetsMode] ClassSetExpression + // ``` + fn parse_class_contents( + &mut self, + ) -> Result<(ast::CharacterClassContentsKind, 
Vec<'a, ast::CharacterClassContents<'a>>)> { + // [empty] + if self.reader.peek().filter(|&cp| cp == ']' as u32).is_some() { + return Ok((ast::CharacterClassContentsKind::Union, Vec::new_in(self.allocator))); + } + + // [+UnicodeSetsMode] ClassSetExpression + if self.state.unicode_sets_mode { + return self.parse_class_set_expression(); + } + + // [~UnicodeSetsMode] NonemptyClassRanges[?UnicodeMode] + self.parse_nonempty_class_ranges() + } + + // ``` + // NonemptyClassRanges[UnicodeMode] :: + // ClassAtom[?UnicodeMode] + // ClassAtom[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode] + // ClassAtom[?UnicodeMode] - ClassAtom[?UnicodeMode] ClassContents[?UnicodeMode, ~UnicodeSetsMode] + // + // NonemptyClassRangesNoDash[UnicodeMode] :: + // ClassAtom[?UnicodeMode] + // ClassAtomNoDash[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode] + // ClassAtomNoDash[?UnicodeMode] - ClassAtom[?UnicodeMode] ClassContents[?UnicodeMode, ~UnicodeSetsMode] + // ``` + fn parse_nonempty_class_ranges( + &mut self, + ) -> Result<(ast::CharacterClassContentsKind, Vec<'a, ast::CharacterClassContents<'a>>)> { + let mut body = Vec::new_in(self.allocator); + + loop { + let range_span_start = self.reader.offset(); + + let Some(class_atom) = self.parse_class_atom()? else { + break; + }; + + let span_start = self.reader.offset(); + if !self.reader.eat('-') { + // ClassAtom[?UnicodeMode] + body.push(class_atom); + continue; + } + + let dash = ast::CharacterClassContents::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '-' as u32, + }); + + let Some(class_atom_to) = self.parse_class_atom()? 
else { + // ClassAtom[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode] + // => ClassAtom[?UnicodeMode] ClassAtom[?UnicodeMode] + // => ClassAtom[?UnicodeMode] - + body.push(class_atom); + body.push(dash); + continue; + }; + + // ClassAtom[?UnicodeMode] - ClassAtom[?UnicodeMode] ClassContents[?UnicodeMode, ~UnicodeSetsMode] + // If both sides are characters, it is a range. + if let ( + ast::CharacterClassContents::Character(from), + ast::CharacterClassContents::Character(to), + ) = (&class_atom, &class_atom_to) + { + // [SS:EE] NonemptyClassRanges :: ClassAtom - ClassAtom ClassContents + // [SS:EE] NonemptyClassRangesNoDash :: ClassAtomNoDash - ClassAtom ClassContents + // It is a Syntax Error if IsCharacterClass of the first ClassAtom is false, IsCharacterClass of the second ClassAtom is false, and the CharacterValue of the first ClassAtom is strictly greater than the CharacterValue of the second ClassAtom. + if to.value < from.value { + return Err(OxcDiagnostic::error("Character class range out of order") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + body.push(ast::CharacterClassContents::CharacterClassRange(Box::new_in( + ast::CharacterClassRange { + span: from.span.merge(&to.span), + min: *from, + max: *to, + }, + self.allocator, + ))); + continue; + } + + // If not, it is just a union of characters. + + // [SS:EE] NonemptyClassRanges :: ClassAtom - ClassAtom ClassContents + // [SS:EE] NonemptyClassRangesNoDash :: ClassAtomNoDash - ClassAtom ClassContents + // It is a Syntax Error if IsCharacterClass of the first ClassAtom is true or IsCharacterClass of the second ClassAtom is true and this production has a [UnicodeMode] parameter. 
+ // (Annex B) + if self.state.unicode_mode { + return Err(OxcDiagnostic::error("Invalid character class range") + .with_label(self.span_factory.create(range_span_start, self.reader.offset()))); + } + + body.push(class_atom); + body.push(dash); + body.push(class_atom_to); + } + + // [empty] is already covered by the caller, but for sure + debug_assert!(!body.is_empty()); + + Ok((ast::CharacterClassContentsKind::Union, body)) + } + + // ``` + // ClassAtom[UnicodeMode] :: + // - + // ClassAtomNoDash[?UnicodeMode] + // ``` + fn parse_class_atom(&mut self) -> Result>> { + let span_start = self.reader.offset(); + + if self.reader.eat('-') { + return Ok(Some(ast::CharacterClassContents::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '-' as u32, + }))); + } + + self.parse_class_atom_no_dash() + } + + // ``` + // ClassAtomNoDash[UnicodeMode, NamedCaptureGroups] :: + // SourceCharacter but not one of \ or ] or - + // \ ClassEscape[?UnicodeMode, ?NamedCaptureGroups] + // \ [lookahead = c] + // ``` + // (Annex B) + fn parse_class_atom_no_dash(&mut self) -> Result>> { + let span_start = self.reader.offset(); + + if let Some(cp) = self + .reader + .peek() + .filter(|&cp| cp != '\\' as u32 && cp != ']' as u32 && cp != '-' as u32) + { + self.reader.advance(); + + return Ok(Some(ast::CharacterClassContents::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: cp, + }))); + } + + if self.reader.eat('\\') { + if self.reader.peek().filter(|&cp| cp == 'c' as u32).is_some() { + return Ok(Some(ast::CharacterClassContents::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '\\' as u32, + }))); + } + + if let Some(class_escape) = self.parse_class_escape(span_start)? 
{ + return Ok(Some(class_escape)); + } + + return Err(OxcDiagnostic::error("Invalid class atom") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + Ok(None) + } + + // ``` + // ClassEscape[UnicodeMode, NamedCaptureGroups] :: + // b + // [+UnicodeMode] - + // [~UnicodeMode] c ClassControlLetter + // CharacterClassEscape[?UnicodeMode] + // CharacterEscape[?UnicodeMode, ?NamedCaptureGroups] + // + // ClassControlLetter :: + // DecimalDigit + // _ + // ``` + // (Annex B) + fn parse_class_escape( + &mut self, + span_start: usize, + ) -> Result>> { + // b + if self.reader.eat('b') { + return Ok(Some(ast::CharacterClassContents::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::SingleEscape, + value: 0x08, + }))); + } + + // [+UnicodeMode] - + if self.state.unicode_mode && self.reader.eat('-') { + return Ok(Some(ast::CharacterClassContents::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '-' as u32, + }))); + } + + // [~UnicodeMode] c ClassControlLetter + if !self.state.unicode_mode { + let checkpoint = self.reader.checkpoint(); + + if self.reader.eat('c') { + if let Some(cp) = self + .reader + .peek() + .filter(|&cp| unicode::is_decimal_digit(cp) || cp == '-' as u32) + { + self.reader.advance(); + + return Ok(Some(ast::CharacterClassContents::Character(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::ControlLetter, + value: cp, + }))); + } + + self.reader.rewind(checkpoint); + } + } + + // CharacterClassEscape[?UnicodeMode] + if let Some(character_class_escape) = self.parse_character_class_escape(span_start) { + return Ok(Some(ast::CharacterClassContents::CharacterClassEscape( + character_class_escape, + ))); + } + if let Some(unicode_property_escape) = + self.parse_character_class_escape_unicode(span_start)? 
+ { + return Ok(Some(ast::CharacterClassContents::UnicodePropertyEscape(Box::new_in( + unicode_property_escape, + self.allocator, + )))); + } + + // CharacterEscape[?UnicodeMode, ?NamedCaptureGroups] + if let Some(character_escape) = self.parse_character_escape(span_start)? { + return Ok(Some(ast::CharacterClassContents::Character(character_escape))); + } + + Ok(None) + } + + // ``` + // ClassSetExpression :: + // ClassUnion + // ClassIntersection + // ClassSubtraction + // ``` + fn parse_class_set_expression( + &mut self, + ) -> Result<(ast::CharacterClassContentsKind, Vec<'a, ast::CharacterClassContents<'a>>)> { + // ClassUnion :: ClassSetRange ClassUnion[opt] + if let Some(class_set_range) = self.parse_class_set_range()? { + return self.parse_class_set_union(class_set_range); + } + + if let Some(class_set_operand) = self.parse_class_set_operand()? { + // ClassIntersection + if self.reader.peek().filter(|&cp| cp == '&' as u32).is_some() + && self.reader.peek2().filter(|&cp| cp == '&' as u32).is_some() + { + return self.parse_class_set_intersection(class_set_operand); + } + // ClassSubtraction + if self.reader.peek().filter(|&cp| cp == '-' as u32).is_some() + && self.reader.peek2().filter(|&cp| cp == '-' as u32).is_some() + { + return self.parse_class_set_subtraction(class_set_operand); + } + + // ClassUnion :: ClassSetOperand ClassUnion[opt] + return self.parse_class_set_union(class_set_operand); + } + + let span_start = self.reader.offset(); + Err(OxcDiagnostic::error("Expected nonempty class set expression") + .with_label(self.span_factory.create(span_start, self.reader.offset()))) + } + + // ``` + // ClassUnion :: + // ClassSetRange ClassUnion[opt] + // ClassSetOperand ClassUnion[opt] + // ``` + fn parse_class_set_union( + &mut self, + class_set_range_or_class_set_operand: ast::CharacterClassContents<'a>, + ) -> Result<(ast::CharacterClassContentsKind, Vec<'a, ast::CharacterClassContents<'a>>)> { + let mut body = Vec::new_in(self.allocator); + 
body.push(class_set_range_or_class_set_operand); + + loop { + if let Some(class_set_range) = self.parse_class_set_range()? { + body.push(class_set_range); + continue; + } + if let Some(class_set_operand) = self.parse_class_set_operand()? { + body.push(class_set_operand); + continue; + } + + break; + } + + Ok((ast::CharacterClassContentsKind::Union, body)) + } + + // ``` + // ClassIntersection :: + // ClassSetOperand && [lookahead ≠ &] ClassSetOperand + // ClassIntersection && [lookahead ≠ &] ClassSetOperand + // ``` + fn parse_class_set_intersection( + &mut self, + class_set_operand: ast::CharacterClassContents<'a>, + ) -> Result<(ast::CharacterClassContentsKind, Vec<'a, ast::CharacterClassContents<'a>>)> { + let mut body = Vec::new_in(self.allocator); + body.push(class_set_operand); + + loop { + if self.reader.peek().filter(|&cp| cp == ']' as u32).is_some() { + break; + } + + if self.reader.eat2('&', '&') { + let span_start = self.reader.offset(); + if self.reader.eat('&') { + return Err(OxcDiagnostic::error( + "Unexpected `&` inside of class interseciton", // spellchecker:disable-line + ) + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + if let Some(class_set_operand) = self.parse_class_set_operand()? 
{ + body.push(class_set_operand); + continue; + } + } + + let span_start = self.reader.offset(); + return Err(OxcDiagnostic::error( + "Invalid character in character class set interseciton", // spellchecker:disable-line + ) + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + Ok((ast::CharacterClassContentsKind::Intersection, body)) + } + + // ``` + // ClassSubtraction :: + // ClassSetOperand -- ClassSetOperand + // ClassSubtraction -- ClassSetOperand + // ``` + fn parse_class_set_subtraction( + &mut self, + class_set_operand: ast::CharacterClassContents<'a>, + ) -> Result<(ast::CharacterClassContentsKind, Vec<'a, ast::CharacterClassContents<'a>>)> { + let mut body = Vec::new_in(self.allocator); + body.push(class_set_operand); + + loop { + if self.reader.peek().filter(|&cp| cp == ']' as u32).is_some() { + break; + } + + if self.reader.eat2('-', '-') { + if let Some(class_set_operand) = self.parse_class_set_operand()? { + body.push(class_set_operand); + continue; + } + } + + let span_start = self.reader.offset(); + return Err(OxcDiagnostic::error( + "Invalid character in character class set subtraction", + ) + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + Ok((ast::CharacterClassContentsKind::Subtraction, body)) + } + + // ``` + // ClassSetRange :: + // ClassSetCharacter - ClassSetCharacter + // ``` + fn parse_class_set_range(&mut self) -> Result>> { + let checkpoint = self.reader.checkpoint(); + + if let Some(class_set_character) = self.parse_class_set_character()? { + if self.reader.eat('-') { + if let Some(class_set_character_to) = self.parse_class_set_character()? { + // [SS:EE] ClassSetRange :: ClassSetCharacter - ClassSetCharacter + // It is a Syntax Error if the CharacterValue of the first ClassSetCharacter is strictly greater than the CharacterValue of the second ClassSetCharacter. 
+ if class_set_character_to.value < class_set_character.value { + return Err(OxcDiagnostic::error("Character set class range out of order") + .with_label( + class_set_character.span.merge(&class_set_character_to.span), + )); + } + + return Ok(Some(ast::CharacterClassContents::CharacterClassRange( + Box::new_in( + ast::CharacterClassRange { + span: class_set_character.span.merge(&class_set_character_to.span), + min: class_set_character, + max: class_set_character_to, + }, + self.allocator, + ), + ))); + } + } + } + self.reader.rewind(checkpoint); + + Ok(None) + } + + // ``` + // ClassSetOperand :: + // NestedClass + // ClassStringDisjunction + // ClassSetCharacter + // + // ClassStringDisjunction :: + // \q{ ClassStringDisjunctionContents } + // ``` + fn parse_class_set_operand(&mut self) -> Result>> { + if let Some(nested_class) = self.parse_nested_class()? { + return Ok(Some(nested_class)); + } + + let span_start = self.reader.offset(); + if self.reader.eat3('\\', 'q', '{') { + let (class_string_disjunction_contents, strings) = + self.parse_class_string_disjunction_contents()?; + + if self.reader.eat('}') { + return Ok(Some(ast::CharacterClassContents::ClassStringDisjunction(Box::new_in( + ast::ClassStringDisjunction { + span: self.span_factory.create(span_start, self.reader.offset()), + strings, + body: class_string_disjunction_contents, + }, + self.allocator, + )))); + } + + return Err(OxcDiagnostic::error("Unterminated class string disjunction") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + if let Some(class_set_character) = self.parse_class_set_character()? 
{ + return Ok(Some(ast::CharacterClassContents::Character(class_set_character))); + } + + Ok(None) + } + + // ``` + // NestedClass :: + // [ [lookahead ≠ ^] ClassContents[+UnicodeMode, +UnicodeSetsMode] ] + // [^ ClassContents[+UnicodeMode, +UnicodeSetsMode] ] + // \ CharacterClassEscape[+UnicodeMode] + // ``` + fn parse_nested_class(&mut self) -> Result>> { + let span_start = self.reader.offset(); + + // [ [lookahead ≠ ^] ClassContents[+UnicodeMode, +UnicodeSetsMode] ] + // [^ ClassContents[+UnicodeMode, +UnicodeSetsMode] ] + if self.reader.eat('[') { + let negative = self.reader.eat('^'); + let (kind, body) = self.parse_class_contents()?; + + if self.reader.eat(']') { + // [SS:EE] NestedClass :: [^ ClassContents ] + // It is a Syntax Error if MayContainStrings of the ClassContents is true. + if negative { + let may_contain_strings = |item: &ast::CharacterClassContents| match item { + // MayContainStrings is true + // - if ClassContents contains UnicodePropertyValueExpression + // - && UnicodePropertyValueExpression is LoneUnicodePropertyNameOrValue + // - && it is binary property of strings(can be true only with `UnicodeSetsMode`) + ast::CharacterClassContents::UnicodePropertyEscape( + unicode_property_escape, + ) => unicode_property_escape.strings, + // MayContainStrings is true + // - if ClassStringDisjunction is [empty] + // - || if ClassStringDisjunction contains ClassString + // - && ClassString is [empty] + // - || ClassString contains 2 more ClassSetCharacters + ast::CharacterClassContents::ClassStringDisjunction( + class_string_disjunction, + ) => class_string_disjunction.strings, + _ => false, + }; + + if match kind { + // MayContainStrings is true + // - if ClassContents is ClassUnion + // - && ClassUnion has ClassOperands + // - && at least 1 ClassOperand has MayContainStrings: true + ast::CharacterClassContentsKind::Union => { + body.iter().any(|item| may_contain_strings(item)) + } + // MayContainStrings is true + // - if ClassContents is 
ClassIntersection + // - && ClassIntersection has ClassOperands + // - && all ClassOperands have MayContainStrings: true + ast::CharacterClassContentsKind::Intersection => { + body.iter().all(|item| may_contain_strings(item)) + } + // MayContainStrings is true + // - if ClassContents is ClassSubtraction + // - && ClassSubtraction has ClassOperands + // - && the first ClassOperand has MayContainStrings: true + ast::CharacterClassContentsKind::Subtraction => { + body.iter().next().map_or(false, |item| may_contain_strings(item)) + } + } { + return Err(OxcDiagnostic::error("Invalid character class").with_label( + self.span_factory.create(span_start, self.reader.offset()), + )); + } + } + + return Ok(Some(ast::CharacterClassContents::NestedCharacterClass(Box::new_in( + ast::CharacterClass { + span: self.span_factory.create(span_start, self.reader.offset()), + negative, + kind, + body, + }, + self.allocator, + )))); + } + + return Err(OxcDiagnostic::error("Unterminated nested class") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + // \ CharacterClassEscape[+UnicodeMode] + let span_start = self.reader.offset(); + let checkpoint = self.reader.checkpoint(); + if self.reader.eat('\\') { + if let Some(character_class_escape) = self.parse_character_class_escape(span_start) { + return Ok(Some(ast::CharacterClassContents::CharacterClassEscape( + character_class_escape, + ))); + } + if let Some(unicode_property_escape) = + self.parse_character_class_escape_unicode(span_start)? 
+ { + return Ok(Some(ast::CharacterClassContents::UnicodePropertyEscape(Box::new_in( + unicode_property_escape, + self.allocator, + )))); + } + + self.reader.rewind(checkpoint); + } + + Ok(None) + } + + // ``` + // ClassStringDisjunctionContents :: + // ClassString + // ClassString | ClassStringDisjunctionContents + // ``` + // Returns: (ClassStringDisjunctionContents, contain_strings) + fn parse_class_string_disjunction_contents( + &mut self, + ) -> Result<(Vec<'a, ast::ClassString<'a>>, bool)> { + let mut body = Vec::new_in(self.allocator); + let mut strings = false; + + loop { + let (class_string, contain_strings) = self.parse_class_string()?; + body.push(class_string); + if contain_strings { + strings = true; + } + + if !self.reader.eat('|') { + break; + } + } + + if body.is_empty() { + strings = true; + } + + Ok((body, strings)) + } + + // ``` + // ClassString :: + // [empty] + // NonEmptyClassString + // + // NonEmptyClassString :: + // ClassSetCharacter NonEmptyClassString[opt] + // ``` + // Returns (ClassString, contain_strings) + fn parse_class_string(&mut self) -> Result<(ast::ClassString<'a>, bool)> { + let span_start = self.reader.offset(); + + let mut body = Vec::new_in(self.allocator); + while let Some(class_set_character) = self.parse_class_set_character()? 
{ + body.push(class_set_character); + } + + // True if empty or contains 2 or more characters + let contain_strings = body.len() != 1; + + Ok(( + ast::ClassString { + span: self.span_factory.create(span_start, self.reader.offset()), + body, + }, + contain_strings, + )) + } + + // ``` + // ClassSetCharacter :: + // [lookahead ∉ ClassSetReservedDoublePunctuator] SourceCharacter but not ClassSetSyntaxCharacter + // \ CharacterEscape[+UnicodeMode] + // \ ClassSetReservedPunctuator + // \b + // ``` + fn parse_class_set_character(&mut self) -> Result> { + let span_start = self.reader.offset(); + + if let (Some(cp1), Some(cp2)) = (self.reader.peek(), self.reader.peek2()) { + if !unicode::is_class_set_reserved_double_punctuator(cp1, cp2) + && !unicode::is_class_set_syntax_character(cp1) + { + self.reader.advance(); + + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: cp1, + })); + } + } + + let checkpoint = self.reader.checkpoint(); + if self.reader.eat('\\') { + if let Some(character_escape) = self.parse_character_escape(span_start)? { + return Ok(Some(character_escape)); + } + + if let Some(cp) = + self.reader.peek().filter(|&cp| unicode::is_class_set_reserved_punctuator(cp)) + { + self.reader.advance(); + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Identifier, + value: cp, + })); + } + + if self.reader.eat('b') { + return Ok(Some(ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::SingleEscape, + value: 0x08, + })); + } + + self.reader.rewind(checkpoint); + } + + Ok(None) + } + + // ``` + // ( GroupSpecifier[?UnicodeMode][opt] Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + // + // GroupSpecifier[UnicodeMode] :: + // ? 
GroupName[?UnicodeMode] + // ``` + fn parse_capturing_group(&mut self) -> Result>> { + let span_start = self.reader.offset(); + + if self.reader.eat('(') { + let mut group_name = None; + + // GroupSpecifier is optional, but if it exists, `?` is also required + if self.reader.eat('?') { + let Some(name) = self.consume_group_name()? else { + return Err(OxcDiagnostic::error("Capturing group name is missing") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + }; + group_name = Some(name); + } + + let disjunction = self.parse_disjunction()?; + if self.reader.eat(')') { + return Ok(Some(ast::CapturingGroup { + span: self.span_factory.create(span_start, self.reader.offset()), + name: group_name, + body: disjunction, + })); + } + + return Err(OxcDiagnostic::error("Unterminated capturing group") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + Ok(None) + } + + // ``` + // (?: Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] ) + // ``` + fn parse_ignore_group(&mut self) -> Result>> { + let span_start = self.reader.offset(); + + if self.reader.eat3('(', '?', ':') { + let disjunction = self.parse_disjunction()?; + + if !self.reader.eat(')') { + return Err(OxcDiagnostic::error("Unterminated ignore group") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + return Ok(Some(ast::IgnoreGroup { + span: self.span_factory.create(span_start, self.reader.offset()), + // TODO: Stage3 ModifierFlags + enabling_modifiers: None, + disabling_modifiers: None, + body: disjunction, + })); + } + + Ok(None) + } + + // --- + + // ``` + // Quantifier :: + // QuantifierPrefix + // QuantifierPrefix ? + // + // QuantifierPrefix :: + // * + // + + // ? 
+ // { DecimalDigits[~Sep] } + // { DecimalDigits[~Sep] ,} + // { DecimalDigits[~Sep] , DecimalDigits[~Sep] } + // ``` + /// Returns: ((min, max), greedy) + #[allow(clippy::type_complexity)] + fn consume_quantifier(&mut self) -> Result), bool)>> { + let is_greedy = |reader: &mut Reader| !reader.eat('?'); + + if self.reader.eat('*') { + return Ok(Some(((0, None), is_greedy(&mut self.reader)))); + } + if self.reader.eat('+') { + return Ok(Some(((1, None), is_greedy(&mut self.reader)))); + } + if self.reader.eat('?') { + return Ok(Some(((0, Some(1)), is_greedy(&mut self.reader)))); + } + + let span_start = self.reader.offset(); + let checkpoint = self.reader.checkpoint(); + if self.reader.eat('{') { + if let Some(min) = self.consume_decimal_digits() { + if self.reader.eat('}') { + return Ok(Some(((min, Some(min)), is_greedy(&mut self.reader)))); + } + + if self.reader.eat(',') { + if self.reader.eat('}') { + return Ok(Some(((min, None), is_greedy(&mut self.reader)))); + } + + if let Some(max) = self.consume_decimal_digits() { + if self.reader.eat('}') { + if max < min { + // [SS:EE] QuantifierPrefix :: { DecimalDigits , DecimalDigits } + // It is a Syntax Error if the MV of the first DecimalDigits is strictly greater than the MV of the second DecimalDigits. 
+ return Err(OxcDiagnostic::error( + "Numbers out of order in braced quantifier", + ) + .with_label( + self.span_factory.create(span_start, self.reader.offset()), + )); + } + + return Ok(Some(((min, Some(max)), is_greedy(&mut self.reader)))); + } + } + } + } + + self.reader.rewind(checkpoint); + } + + Ok(None) + } + + // ``` + // DecimalEscape :: + // NonZeroDigit DecimalDigits[~Sep][opt] [lookahead ∉ DecimalDigit] + // ``` + fn consume_decimal_escape(&mut self) -> Option { + let checkpoint = self.reader.checkpoint(); + + if let Some(index) = self.consume_decimal_digits() { + // \0 is CharacterEscape, not DecimalEscape + if index != 0 { + return Some(index); + } + + self.reader.rewind(checkpoint); + } + + None + } + + // ``` + // DecimalDigits[Sep] :: + // DecimalDigit + // DecimalDigits[?Sep] DecimalDigit + // [+Sep] DecimalDigits[+Sep] NumericLiteralSeparator DecimalDigit + // ``` + // ([Sep] is disabled for `QuantifierPrefix` and `DecimalEscape`, skip it) + fn consume_decimal_digits(&mut self) -> Option { + let checkpoint = self.reader.checkpoint(); + + let mut value = 0; + while let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_decimal_digit(cp)) { + // `- '0' as u32`: convert code point to digit + value = (10 * value) + (cp - '0' as u32); + self.reader.advance(); + } + + if self.reader.checkpoint() != checkpoint { + return Some(value); + } + + None + } + + // ``` + // UnicodePropertyValueExpression :: + // UnicodePropertyName = UnicodePropertyValue + // LoneUnicodePropertyNameOrValue + // ``` + /// Returns: `(name, Option, is_strings_related_unicode_property)` + fn consume_unicode_property_value_expression( + &mut self, + ) -> Result, Option>, bool)>> { + let checkpoint = self.reader.checkpoint(); + + // UnicodePropertyName=UnicodePropertyValue + if let Some(name) = self.consume_unicode_property_name() { + if self.reader.eat('=') { + let span_start = self.reader.offset(); + if let Some(value) = self.consume_unicode_property_value() { + // [SS:EE] 
UnicodePropertyValueExpression :: UnicodePropertyName = UnicodePropertyValue + // It is a Syntax Error if the source text matched by UnicodePropertyName is not a Unicode property name or property alias listed in the “Property name and aliases” column of Table 65. + // [SS:EE] UnicodePropertyValueExpression :: UnicodePropertyName = UnicodePropertyValue + // It is a Syntax Error if the source text matched by UnicodePropertyValue is not a property value or property value alias for the Unicode property or property alias given by the source text matched by UnicodePropertyName listed in PropertyValueAliases.txt. + if !unicode_property::is_valid_unicode_property(&name, &value) { + return Err(OxcDiagnostic::error("Invalid unicode property name") + .with_label( + self.span_factory.create(span_start, self.reader.offset()), + )); + } + + return Ok(Some((name, Some(value), false))); + } + } + } + self.reader.rewind(checkpoint); + + let span_start = self.reader.offset(); + // LoneUnicodePropertyNameOrValue + if let Some(name_or_value) = self.consume_unicode_property_value() { + // [SS:EE] UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue + // It is a Syntax Error if the source text matched by LoneUnicodePropertyNameOrValue is not a Unicode property value or property value alias for the General_Category (gc) property listed in PropertyValueAliases.txt, nor a binary property or binary property alias listed in the “Property name and aliases” column of Table 66, nor a binary property of strings listed in the “Property name” column of Table 67. 
+ if unicode_property::is_valid_unicode_property("General_Category", &name_or_value) { + return Ok(Some(("General_Category".into(), Some(name_or_value), false))); + } + if unicode_property::is_valid_lone_unicode_property(&name_or_value) { + return Ok(Some((name_or_value, None, false))); + } + // [SS:EE] UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue + // It is a Syntax Error if the enclosing Pattern does not have a [UnicodeSetsMode] parameter and the source text matched by LoneUnicodePropertyNameOrValue is a binary property of strings listed in the “Property name” column of Table 67. + if unicode_property::is_valid_lone_unicode_property_of_strings(&name_or_value) { + if !self.state.unicode_sets_mode { + return Err(OxcDiagnostic::error( + "`UnicodeSetsMode` is required for binary property of strings", + ) + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + return Ok(Some((name_or_value, None, true))); + } + + return Err(OxcDiagnostic::error("Invalid unicode property name or value") + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + + Ok(None) + } + + fn consume_unicode_property_name(&mut self) -> Option> { + let span_start = self.reader.offset(); + + let checkpoint = self.reader.checkpoint(); + while unicode::is_unicode_property_name_character(self.reader.peek()?) { + self.reader.advance(); + } + + if checkpoint == self.reader.checkpoint() { + return None; + } + + Some(SpanAtom::from(&self.source_text[span_start..self.reader.offset()])) + } + + fn consume_unicode_property_value(&mut self) -> Option> { + let span_start = self.reader.offset(); + + let checkpoint = self.reader.checkpoint(); + while unicode::is_unicode_property_value_character(self.reader.peek()?) 
{
+            self.reader.advance();
+        }
+
+        if checkpoint == self.reader.checkpoint() {
+            return None;
+        }
+
+        Some(SpanAtom::from(&self.source_text[span_start..self.reader.offset()]))
+    }
+
+    // ```
+    // GroupName[UnicodeMode] ::
+    //   < RegExpIdentifierName[?UnicodeMode] >
+    // ```
+    // Returns `Ok(None)` when the next unit is not `<`.
+    // Once `<` is consumed, a missing name or a missing `>` is reported as an error.
+    fn consume_group_name(&mut self) -> Result<Option<SpanAtom<'a>>> {
+        let span_start = self.reader.offset();
+
+        if !self.reader.eat('<') {
+            return Ok(None);
+        }
+
+        if let Some(group_name) = self.consume_reg_exp_idenfigier_name()? {
+            if self.reader.eat('>') {
+                return Ok(Some(group_name));
+            }
+        }
+
+        Err(OxcDiagnostic::error("Unterminated capturing group name")
+            .with_label(self.span_factory.create(span_start, self.reader.offset())))
+    }
+
+    // ```
+    // RegExpIdentifierName[UnicodeMode] ::
+    //   RegExpIdentifierStart[?UnicodeMode]
+    //   RegExpIdentifierName[?UnicodeMode] RegExpIdentifierPart[?UnicodeMode]
+    // ```
+    // The returned atom is the raw source slice covering the start and all following parts.
+    fn consume_reg_exp_idenfigier_name(&mut self) -> Result<Option<SpanAtom<'a>>> {
+        let span_start = self.reader.offset();
+
+        if self.consume_reg_exp_idenfigier_start()?.is_some() {
+            while self.consume_reg_exp_idenfigier_part()?.is_some() {}
+
+            return Ok(Some(SpanAtom::from(&self.source_text[span_start..self.reader.offset()])));
+        }
+
+        Ok(None)
+    }
+
+    // ```
+    // RegExpIdentifierStart[UnicodeMode] ::
+    //   IdentifierStartChar
+    //   \ RegExpUnicodeEscapeSequence[+UnicodeMode]
+    //   [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
+    // ```
+    fn consume_reg_exp_idenfigier_start(&mut self) -> Result<Option<u32>> {
+        if let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_identifier_start_char(cp)) {
+            self.reader.advance();
+            return Ok(Some(cp));
+        }
+
+        let span_start = self.reader.offset();
+        if self.reader.eat('\\') {
+            if let Some(cp) = self.consume_reg_exp_unicode_escape_sequence(true)? {
+                // [SS:EE] RegExpIdentifierStart :: \ RegExpUnicodeEscapeSequence
+                // It is a Syntax Error if the CharacterValue of RegExpUnicodeEscapeSequence is not the numeric value of some code point matched by the IdentifierStartChar lexical grammar production.
+                if !unicode::is_identifier_start_char(cp) {
+                    return Err(OxcDiagnostic::error("Invalid unicode escape sequence")
+                        .with_label(self.span_factory.create(span_start, self.reader.offset())));
+                }
+
+                return Ok(Some(cp));
+            }
+        }
+
+        if !self.state.unicode_mode {
+            let span_start = self.reader.offset();
+
+            if let Some(lead_surrogate) =
+                self.reader.peek().filter(|&cp| unicode::is_lead_surrogate(cp))
+            {
+                if let Some(trail_surrogate) =
+                    self.reader.peek2().filter(|&cp| unicode::is_trail_surrogate(cp))
+                {
+                    self.reader.advance();
+                    self.reader.advance();
+                    let cp = unicode::combine_surrogate_pair(lead_surrogate, trail_surrogate);
+
+                    // [SS:EE] RegExpIdentifierStart :: UnicodeLeadSurrogate UnicodeTrailSurrogate
+                    // It is a Syntax Error if the RegExpIdentifierCodePoint of RegExpIdentifierStart is not matched by the UnicodeIDStart lexical grammar production.
+                    if !unicode::is_unicode_id_start(cp) {
+                        return Err(OxcDiagnostic::error("Invalid surrogate pair").with_label(
+                            self.span_factory.create(span_start, self.reader.offset()),
+                        ));
+                    }
+
+                    return Ok(Some(cp));
+                }
+            }
+        }
+
+        Ok(None)
+    }
+
+    // ```
+    // RegExpIdentifierPart[UnicodeMode] ::
+    //   IdentifierPartChar
+    //   \ RegExpUnicodeEscapeSequence[+UnicodeMode]
+    //   [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
+    // ```
+    fn consume_reg_exp_idenfigier_part(&mut self) -> Result<Option<u32>> {
+        if let Some(cp) = self.reader.peek() {
+            if unicode::is_identifier_part_char(cp) {
+                self.reader.advance();
+                return Ok(Some(cp));
+            }
+        }
+
+        let span_start = self.reader.offset();
+        if self.reader.eat('\\') {
+            if let Some(cp) = self.consume_reg_exp_unicode_escape_sequence(true)? {
+                // [SS:EE] RegExpIdentifierPart :: \ RegExpUnicodeEscapeSequence
+                // It is a Syntax Error if the CharacterValue of RegExpUnicodeEscapeSequence is not the numeric value of some code point matched by the IdentifierPartChar lexical grammar production.
+                if !unicode::is_identifier_part_char(cp) {
+                    return Err(OxcDiagnostic::error("Invalid unicode escape sequence")
+                        .with_label(self.span_factory.create(span_start, self.reader.offset())));
+                }
+
+                return Ok(Some(cp));
+            }
+        }
+
+        if !self.state.unicode_mode {
+            let span_start = self.reader.offset();
+
+            if let Some(lead_surrogate) =
+                self.reader.peek().filter(|&cp| unicode::is_lead_surrogate(cp))
+            {
+                if let Some(trail_surrogate) =
+                    self.reader.peek2().filter(|&cp| unicode::is_trail_surrogate(cp))
+                {
+                    self.reader.advance();
+                    self.reader.advance();
+
+                    let cp = unicode::combine_surrogate_pair(lead_surrogate, trail_surrogate);
+                    // [SS:EE] RegExpIdentifierPart :: UnicodeLeadSurrogate UnicodeTrailSurrogate
+                    // It is a Syntax Error if the RegExpIdentifierCodePoint of RegExpIdentifierPart is not matched by the UnicodeIDContinue lexical grammar production.
+                    if !unicode::is_unicode_id_continue(cp) {
+                        return Err(OxcDiagnostic::error("Invalid surrogate pair").with_label(
+                            self.span_factory.create(span_start, self.reader.offset()),
+                        ));
+                    }
+
+                    return Ok(Some(cp));
+                }
+            }
+        }
+
+        Ok(None)
+    }
+
+    // ```
+    // RegExpUnicodeEscapeSequence[UnicodeMode] ::
+    //   [+UnicodeMode] u HexLeadSurrogate \u HexTrailSurrogate
+    //   [+UnicodeMode] u HexLeadSurrogate
+    //   [+UnicodeMode] u HexTrailSurrogate
+    //   [+UnicodeMode] u HexNonSurrogate
+    //   [~UnicodeMode] u Hex4Digits
+    //   [+UnicodeMode] u{ CodePoint }
+    // ```
+    // The `unicode_mode` argument selects which alternatives are tried,
+    // while `self.state.unicode_mode` decides whether a malformed escape is an error
+    // or is rewound and reported as `Ok(None)`.
+    fn consume_reg_exp_unicode_escape_sequence(
+        &mut self,
+        unicode_mode: bool,
+    ) -> Result<Option<u32>> {
+        let span_start = self.reader.offset();
+        let checkpoint = self.reader.checkpoint();
+
+        if self.reader.eat('u') {
+            if unicode_mode {
+                let checkpoint = self.reader.checkpoint();
+
+                // HexLeadSurrogate + HexTrailSurrogate
+                if let Some(lead_surrogate) =
+                    self.consume_fixed_hex_digits(4).filter(|&cp| unicode::is_lead_surrogate(cp))
+                {
+                    if self.reader.eat2('\\', 'u') {
+                        if let Some(trail_surrogate) = self
+                            .consume_fixed_hex_digits(4)
+                            .filter(|&cp| unicode::is_trail_surrogate(cp))
+                        {
+                            return Ok(Some(unicode::combine_surrogate_pair(
+                                lead_surrogate,
+                                trail_surrogate,
+                            )));
+                        }
+                    }
+                }
+                self.reader.rewind(checkpoint);
+
+                // HexLeadSurrogate
+                if let Some(lead_surrogate) =
+                    self.consume_fixed_hex_digits(4).filter(|&cp| unicode::is_lead_surrogate(cp))
+                {
+                    return Ok(Some(lead_surrogate));
+                }
+                self.reader.rewind(checkpoint);
+
+                // HexTrailSurrogate
+                if let Some(trail_surrogate) =
+                    self.consume_fixed_hex_digits(4).filter(|&cp| unicode::is_trail_surrogate(cp))
+                {
+                    return Ok(Some(trail_surrogate));
+                }
+                self.reader.rewind(checkpoint);
+            }
+
+            // HexNonSurrogate and Hex4Digits are the same
+            if let Some(hex_digits) = self.consume_fixed_hex_digits(4) {
+                return Ok(Some(hex_digits));
+            }
+
+            // {CodePoint}
+            if unicode_mode {
+                let checkpoint = self.reader.checkpoint();
+
+                if self.reader.eat('{') {
+                    if let Some(hex_digits) =
+                        self.consume_hex_digits().filter(|&cp| unicode::is_valid_unicode(cp))
+                    {
+                        if self.reader.eat('}') {
+                            return Ok(Some(hex_digits));
+                        }
+                    }
+                }
+                self.reader.rewind(checkpoint);
+            }
+
+            if self.state.unicode_mode {
+                return Err(OxcDiagnostic::error("Invalid unicode escape sequence")
+                    .with_label(self.span_factory.create(span_start, self.reader.offset())));
+            }
+            self.reader.rewind(checkpoint);
+        }
+
+        Ok(None)
+    }
+
+    // ```
+    // LegacyOctalEscapeSequence ::
+    //   0 [lookahead ∈ { 8, 9 }]
+    //   NonZeroOctalDigit [lookahead ∉ OctalDigit]
+    //   ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
+    //   FourToSeven OctalDigit
+    //   ZeroToThree OctalDigit OctalDigit
+    // ```
+    // Returns the numeric value of the escape, consuming 1 to 3 octal digits.
+    fn consume_legacy_octal_escape_sequence(&mut self) -> Option<u32> {
+        if let Some(first) = self.consume_octal_digit() {
+            // 0 [lookahead ∈ { 8, 9 }]
+            if first == 0
+                && self.reader.peek().filter(|&cp| cp == '8' as u32 || cp == '9' as u32).is_some()
+            {
+                return Some(first);
+            }
+
+            if let Some(second) = self.consume_octal_digit() {
+                // ZeroToThree OctalDigit OctalDigit
+                // Only look for a third digit when `first` is 0..=3.
+                // After `FourToSeven OctalDigit`, the next digit is not part of this escape
+                // and must be left unconsumed for the following term.
+                if first <= 3 {
+                    if let Some(third) = self.consume_octal_digit() {
+                        return Some(first * 64 + second * 8 + third);
+                    }
+                }
+
+                // ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
+                // FourToSeven OctalDigit
+                return Some(first * 8 + second);
+            }
+
+            // NonZeroOctalDigit [lookahead ∉ OctalDigit]
+            return Some(first);
+        }
+
+        None
+    }
+
+    // Consumes a single octal digit(0-7) and returns its numeric value.
+    fn consume_octal_digit(&mut self) -> Option<u32> {
+        let cp = self.reader.peek()?;
+
+        if unicode::is_octal_digit(cp) {
+            self.reader.advance();
+            // `- '0' as u32`: convert code point to digit
+            return Some(cp - '0' as u32);
+        }
+
+        None
+    }
+
+    // ```
+    // IdentityEscape[UnicodeMode, NamedCaptureGroups] ::
+    //   [+UnicodeMode] SyntaxCharacter
+    //   [+UnicodeMode] /
+    //   [~UnicodeMode] SourceCharacterIdentityEscape[?NamedCaptureGroups]
+    //
+    // SourceCharacterIdentityEscape[NamedCaptureGroups] ::
+    //   [~NamedCaptureGroups] SourceCharacter but not c
+    //   [+NamedCaptureGroups] SourceCharacter but not one of c or k
+    // ```
+    // (Annex B)
+    fn consume_identity_escape(&mut self) -> Option<u32> {
+        let cp = self.reader.peek()?;
+
+        if self.state.unicode_mode {
+            if unicode::is_syntax_character(cp) || cp == '/' as u32 {
+                self.reader.advance();
+                return Some(cp);
+            }
+            return None;
+        }
+
+        if self.state.named_capture_groups {
+            if cp != 'c' as u32 && cp != 'k' as u32 {
+                self.reader.advance();
+                return Some(cp);
+            }
+            return None;
+        }
+
+        if cp != 'c' as u32 {
+            self.reader.advance();
+            return Some(cp);
+        }
+
+        None
+    }
+
+    // ```
+    // ExtendedPatternCharacter ::
+    //   SourceCharacter but not one of ^ $ \ . * + ? ( ) [ |
+    // ```
+    fn consume_extended_pattern_character(&mut self) -> Option<u32> {
+        let cp = self.reader.peek()?;
+
+        if cp == '^' as u32
+            || cp == '$' as u32
+            || cp == '\\' as u32
+            || cp == '.' as u32
+            || cp == '*' as u32
+            || cp == '+' as u32
+            || cp == '?' as u32
+            || cp == '(' as u32
+            || cp == ')' as u32
+            || cp == '[' as u32
+            || cp == '|' as u32
+        {
+            return None;
+        }
+
+        self.reader.advance();
+        Some(cp)
+    }
+
+    // Consumes 1 or more hex digits and returns their value, or `None` if there is no digit.
+    // The digit count is unbounded, so the value saturates at `u32::MAX` instead of overflowing.
+    // A saturated value is far above 0x10ffff and is rejected by the `is_valid_unicode` filter
+    // applied where `u{ CodePoint }` is parsed.
+    fn consume_hex_digits(&mut self) -> Option<u32> {
+        let checkpoint = self.reader.checkpoint();
+
+        let mut value: u32 = 0;
+        while let Some(hex) = self.reader.peek().and_then(unicode::map_hex_digit) {
+            value = value.saturating_mul(16).saturating_add(hex);
+            self.reader.advance();
+        }
+
+        if self.reader.checkpoint() != checkpoint {
+            return Some(value);
+        }
+
+        None
+    }
+
+    // Consumes exactly `len` hex digits and returns their value.
+    // If fewer digits are available, nothing is consumed and `None` is returned.
+    fn consume_fixed_hex_digits(&mut self, len: usize) -> Option<u32> {
+        let checkpoint = self.reader.checkpoint();
+
+        let mut value = 0;
+        for _ in 0..len {
+            let Some(hex) = self.reader.peek().and_then(unicode::map_hex_digit) else {
+                self.reader.rewind(checkpoint);
+                return None;
+            };
+
+            value = (16 * value) + hex;
+            self.reader.advance();
+        }
+
+        Some(value)
+    }
+}
diff --git a/crates/oxc_regexp_parser/src/body_parser/reader.rs b/crates/oxc_regexp_parser/src/body_parser/reader.rs
new file mode 100644
index 0000000000000..a9453f46adce9
--- /dev/null
+++ b/crates/oxc_regexp_parser/src/body_parser/reader.rs
@@ -0,0 +1,265 @@
+/// Cursor over a pattern's source text that reads either Unicode code points
+/// (unicode mode) or UTF-16 code units (non-unicode mode), while still being
+/// able to report UTF-8 byte offsets for `Span` creation.
+pub struct Reader<'a> {
+    source: &'a str,
+    unicode_mode: bool,
+    /// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
+    index: usize,
+    /// Even in non-unicode mode, used for `Span` offset calculation.
+    u8_units: Vec<(usize, char)>,
+    u16_units: Vec<u16>,
+    /// Last offset caches for non-unicode mode.
+    last_offset_indices: (usize, usize),
+}
+
+impl<'a> Reader<'a> {
+    /// Creates a reader over `source`. In unicode mode units are code points,
+    /// otherwise they are UTF-16 code units.
+    pub fn new(source: &'a str, unicode_mode: bool) -> Self {
+        // NOTE: Distinguishing these 2 units looks cleaner, but it may not be necessary.
+        // As a parser, AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
+        // NOTE: Collecting `Vec` may not be efficient if the source is too large.
+        // Implements lookahead cache with `VecDeque` is better...?
+        let u8_units = source.char_indices().collect::<Vec<_>>();
+        let u16_units = if unicode_mode { "" } else { source }.encode_utf16().collect::<Vec<_>>();
+
+        Self { source, unicode_mode, index: 0, u8_units, u16_units, last_offset_indices: (0, 0) }
+    }
+
+    /// Returns the UTF-8 byte offset into `source` for the current index.
+    /// In non-unicode mode, the UTF-16 index is mapped back by walking `u8_units`,
+    /// resuming from the position cached by the previous call.
+    pub fn offset(&mut self) -> usize {
+        if self.unicode_mode {
+            self.u8_units.get(self.index).map_or(self.source.len(), |(idx, _)| *idx)
+        } else {
+            let (mut u16_idx, mut u8_idx) = self.last_offset_indices;
+            for (idx, ch) in &self.u8_units[u8_idx..] {
+                if self.index <= u16_idx {
+                    self.last_offset_indices = (u16_idx, u8_idx);
+                    return *idx;
+                }
+
+                u16_idx += ch.len_utf16();
+                u8_idx += 1;
+            }
+            self.source.len()
+        }
+    }
+
+    // NOTE: For now, `usize` is enough for the checkpoint.
+    // But `last_offset_indices` should be stored as well for more performance?
+    /// Returns the current index, to be passed to `rewind()` later.
+    pub fn checkpoint(&self) -> usize {
+        self.index
+    }
+
+    /// Moves back to `checkpoint` and resets the offset cache,
+    /// because `offset()` can only resume its scan in the forward direction.
+    pub fn rewind(&mut self, checkpoint: usize) {
+        self.index = checkpoint;
+        self.last_offset_indices = (0, 0);
+    }
+
+    /// Moves forward by one unit. No bounds check is performed here.
+    pub fn advance(&mut self) {
+        self.index += 1;
+    }
+
+    /// Returns the unit `n` positions ahead of the current index as `u32`, or `None` past the end.
+    fn peek_nth(&self, n: usize) -> Option<u32> {
+        let nth = self.index + n;
+
+        if self.unicode_mode {
+            self.u8_units.get(nth).map(|&(_, ch)| ch as u32)
+        } else {
+            self.u16_units.get(nth).map(|&cu| u32::from(cu))
+        }
+    }
+
+    /// Returns the current unit without consuming it.
+    pub fn peek(&self) -> Option<u32> {
+        self.peek_nth(0)
+    }
+
+    /// Returns the unit right after the current one without consuming anything.
+    pub fn peek2(&self) -> Option<u32> {
+        self.peek_nth(1)
+    }
+
+    /// Consumes the current unit and returns `true` only if it equals `ch`.
+    pub fn eat(&mut self, ch: char) -> bool {
+        if self.peek_nth(0) == Some(ch as u32) {
+            self.advance();
+            return true;
+        }
+        false
+    }
+
+    /// Like `eat()`, but for 2 consecutive units. Consumes both or nothing.
+    pub fn eat2(&mut self, ch: char, ch2: char) -> bool {
+        if self.peek_nth(0) == Some(ch as u32) && self.peek_nth(1) == Some(ch2 as u32) {
+            self.advance();
+            self.advance();
+            return true;
+        }
+        false
+    }
+
+    /// Like `eat()`, but for 3 consecutive units. Consumes all of them or nothing.
+    pub fn eat3(&mut self, ch: char, ch2: char, ch3: char) -> bool {
+        if self.peek_nth(0) == Some(ch as u32)
+            && self.peek_nth(1) == Some(ch2 as u32)
+            && self.peek_nth(2) == Some(ch3 as u32)
+        {
+            self.advance();
+            self.advance();
+            self.advance();
+            return true;
+        }
+        false
+    }
+
+    /// Like `eat()`, but for 4 consecutive units. Consumes all of them or nothing.
+    pub fn eat4(&mut self, ch: char, ch2: char, ch3: char, ch4: char) -> bool {
+        if self.peek_nth(0) == Some(ch as u32)
+            && self.peek_nth(1) == Some(ch2 as u32)
+            && self.peek_nth(2) == Some(ch3 as u32)
+            && self.peek_nth(3) == Some(ch4 as u32)
+        {
+            self.advance();
+            self.advance();
+            self.advance();
+            self.advance();
+            return true;
+        }
+        false
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn index_basic() {
+        let source_text = "/RegExp✨/i";
+        let unicode_reader = Reader::new(source_text, true);
+        let legacy_reader = Reader::new(source_text, false);
+
+        for mut reader in [unicode_reader, legacy_reader] {
+            assert_eq!(reader.index, 0);
+            assert_eq!(reader.peek(), Some('/' as u32));
+
+            reader.advance();
+            assert_eq!(reader.index, 1);
+            assert_eq!(reader.peek(), Some('R' as u32));
+            assert_eq!(reader.peek2(), Some('e' as u32));
+
+            assert!(reader.eat('R'));
+            assert!(!reader.eat('R'));
+            assert!(reader.eat('e'));
+            assert!(reader.eat('g'));
+            assert!(reader.eat('E'));
+            assert!(!reader.eat3('E', 'x', 'p'));
+            assert!(reader.eat2('x', 'p'));
+
+            let checkpoint = reader.checkpoint();
+            assert_eq!(checkpoint, 7);
+            assert_eq!(reader.peek(), Some('✨' as u32));
+
+            reader.advance();
+            reader.advance();
+            assert_eq!(reader.peek(), Some('i' as u32));
+
+            reader.advance();
+            assert_eq!(reader.peek(), None);
+
+            reader.rewind(checkpoint);
+            assert_eq!(reader.peek(), Some('✨' as u32));
+        }
+    }
+
+    #[test]
+    fn index_unicode() {
+        let source_text = "𠮷野家は👈🏻あっち";
+
+        let mut unicode_reader = Reader::new(source_text, true);
+
+        assert!(unicode_reader.eat('𠮷')); // Can eat
+        assert!(unicode_reader.eat2('野', '家'));
+        let checkpoint = unicode_reader.checkpoint();
+        assert!(unicode_reader.eat('は'));
+
+        // Emoji + Skin tone
+        unicode_reader.advance();
+        unicode_reader.advance();
+
+        assert!(unicode_reader.eat('あ'));
+        assert_eq!(unicode_reader.peek(), Some('っ' as u32));
+        assert_eq!(unicode_reader.peek2(), Some('ち' as u32));
+
+        unicode_reader.rewind(checkpoint);
+        assert!(unicode_reader.eat('は'));
+
+        let mut legacy_reader = Reader::new(source_text, false);
+
+        assert!(!legacy_reader.eat('𠮷')); // Can not eat
+        legacy_reader.advance();
+        assert!(!legacy_reader.eat('𠮷')); // Also can not
+        legacy_reader.advance();
+
+        assert!(legacy_reader.eat('野'));
+        assert!(legacy_reader.eat('家'));
+        let checkpoint = legacy_reader.checkpoint();
+        assert!(legacy_reader.eat('は'));
+
+        legacy_reader.advance();
+        legacy_reader.advance();
+        legacy_reader.advance();
+        legacy_reader.advance();
+
+        assert_eq!(legacy_reader.peek(), Some('あ' as u32));
+        assert_eq!(legacy_reader.peek2(), Some('っ' as u32));
+        assert!(legacy_reader.eat3('あ', 'っ', 'ち'));
+
+        legacy_reader.rewind(checkpoint);
+        assert!(legacy_reader.eat('は'));
+    }
+
+    #[test]
+    fn span_position() {
+        let source_text = "^ Catch😎 @ symbols🇺🇳 $";
+
+        let unicode_reader = Reader::new(source_text, true);
+        let legacy_reader = Reader::new(source_text, false);
+
+        for mut reader in [unicode_reader, legacy_reader] {
+            while reader.peek() != Some('^' as u32) {
+                reader.advance();
+            }
+            let s1 = reader.offset();
+            assert!(reader.eat('^'));
+            let e1 = reader.offset();
+
+            while reader.peek() != Some('@' as u32) {
+                reader.advance();
+            }
+            let s2 = reader.offset();
+            assert!(reader.eat('@'));
+            let e2 = reader.offset();
+
+            while reader.peek() != Some('$' as u32) {
+                reader.advance();
+            }
+            let s3 = reader.offset();
+            assert!(reader.eat('$'));
+            let e3 = reader.offset();
+
+            assert_eq!(&source_text[s1..e1], "^");
+            assert_eq!(&source_text[s2..e2], "@");
+            assert_eq!(&source_text[s3..e3], "$");
+        }
+    }
+}
diff --git a/crates/oxc_regexp_parser/src/body_parser/state.rs b/crates/oxc_regexp_parser/src/body_parser/state.rs
new file mode 100644
index 0000000000000..cc6f21f134b35
--- /dev/null
+++ b/crates/oxc_regexp_parser/src/body_parser/state.rs
@@ -0,0 +1,141 @@
+use rustc_hash::FxHashSet;
+
+use super::reader::Reader;
+
+/// Currently all of properties are read only from outside of this module.
+/// Even inside of this module, it is not changed after initialized.
+#[derive(Debug)] +pub struct State<'a> { + // Mode flags + pub unicode_mode: bool, + pub unicode_sets_mode: bool, + pub named_capture_groups: bool, + // Other states + pub num_of_capturing_groups: u32, + pub num_of_named_capturing_groups: u32, + pub found_group_names: FxHashSet<&'a str>, +} + +impl<'a> State<'a> { + pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self { + Self { + unicode_mode, + unicode_sets_mode, + named_capture_groups: false, + num_of_capturing_groups: 0, + num_of_named_capturing_groups: 0, + found_group_names: FxHashSet::default(), + } + } + + pub fn initialize_with_parsing(&mut self, source_text: &'a str) { + let (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups) = + parse_capturing_groups(source_text); + + // In Annex B, this is `false` by default. + // It is `true` + // - if `u` or `v` flag is set + // - or if `GroupName` is found in pattern + self.named_capture_groups = + self.unicode_mode || self.unicode_sets_mode || 0 < num_of_named_capturing_groups; + + self.num_of_capturing_groups = num_of_left_parens; + self.num_of_named_capturing_groups = num_of_named_capturing_groups; + self.found_group_names = named_capturing_groups; + } +} + +/// Returns: (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups) +fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) { + let mut num_of_left_parens = 0; + let mut num_of_named_capturing_groups = 0; + let mut named_capturing_groups = FxHashSet::default(); + + let mut reader = Reader::new(source_text, true); + + let mut in_escape = false; + let mut in_character_class = false; + + // Count only normal CapturingGroup(named, unnamed) + // (?...), (...) + // IgnoreGroup, and LookaroundAssertions are ignored + // (?:...) + // (?=...), (?!...), (?<=...), (?' 
as u32 { + break; + } + reader.advance(); + } + let span_end = reader.offset(); + + if reader.eat('>') { + let group_name = &source_text[span_start..span_end]; + // May be duplicated, but it's OK + named_capturing_groups.insert(group_name); + num_of_named_capturing_groups += 1; + continue; + } + } + } + + reader.advance(); + } + + (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_count_capturing_groups() { + for (source_text, expected_num_of_left_parens, expected_num_of_named_capturing_groups) in [ + ("()", 1, 0), + (r"\1()", 1, 0), + ("(foo)", 1, 0), + ("(foo)(bar)", 2, 0), + ("(foo(bar))", 2, 0), + ("(foo)[(bar)]", 1, 0), + (r"(foo)\(bar\)", 1, 0), + ("(foo)(?bar)", 2, 1), + ("(foo)(?=...)(?!...)(?<=...)(?bar)(?baz)", 3, 2), + ("(?.)(?..)", 2, 2), + ] { + let (num_of_left_parens, num_of_named_capturing_groups, _) = + parse_capturing_groups(source_text); + assert_eq!(expected_num_of_left_parens, num_of_left_parens); + assert_eq!(expected_num_of_named_capturing_groups, num_of_named_capturing_groups); + } + } +} diff --git a/crates/oxc_regexp_parser/src/body_parser/unicode.rs b/crates/oxc_regexp_parser/src/body_parser/unicode.rs new file mode 100644 index 0000000000000..7a4249b4734f2 --- /dev/null +++ b/crates/oxc_regexp_parser/src/body_parser/unicode.rs @@ -0,0 +1,146 @@ +// ``` +// SyntaxCharacter :: one of +// ^ $ \ . * + ? ( ) [ ] { } | +// ``` +pub fn is_syntax_character(cp: u32) -> bool { + char::from_u32(cp).map_or(false, |ch| { + matches!( + ch, + '^' | '$' | '\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' + ) + }) +} + +// ``` +// ClassSetSyntaxCharacter :: one of +// ( ) [ ] { } / - \ | +// ``` +pub fn is_class_set_syntax_character(cp: u32) -> bool { + char::from_u32(cp).map_or(false, |ch| { + matches!(ch, '(' | ')' | '[' | ']' | '{' | '}' | '/' | '-' | '\\' | '|') + }) +} + +// ``` +// ClassSetReservedDoublePunctuator :: one of +// && !! 
## $$ %% ** ++ ,, .. :: ;; << == >> ?? @@ ^^ `` ~~ +// ```` +pub fn is_class_set_reserved_double_punctuator(cp1: u32, cp2: u32) -> bool { + char::from_u32(cp1).map_or(false, |ch1| { + char::from_u32(cp2).map_or(false, |ch2| { + matches!( + (ch1, ch2), + ('&', '&') + | ('!', '!') + | ('#', '#') + | ('$', '$') + | ('%', '%') + | ('*', '*') + | ('+', '+') + | (',', ',') + | ('.', '.') + | (':', ':') + | (';', ';') + | ('<', '<') + | ('=', '=') + | ('>', '>') + | ('?', '?') + | ('@', '@') + | ('^', '^') + | ('`', '`') + | ('~', '~') + ) + }) + }) +} + +// ``` +// ClassSetReservedPunctuator :: one of +// & - ! # % , : ; < = > @ ` ~ +// ``` +pub fn is_class_set_reserved_punctuator(cp: u32) -> bool { + char::from_u32(cp).map_or(false, |ch| { + matches!( + ch, + '&' | '-' | '!' | '#' | '%' | ',' | ':' | ';' | '<' | '=' | '>' | '@' | '`' | '~' + ) + }) +} + +pub fn is_decimal_digit(cp: u32) -> bool { + char::from_u32(cp).map_or(false, |ch| ch.is_ascii_digit()) +} + +pub fn is_octal_digit(cp: u32) -> bool { + char::from_u32(cp).map_or(false, |ch| ch.is_ascii_digit() && ch < '8') +} + +pub fn is_valid_unicode(cp: u32) -> bool { + (0..=0x0010_ffff).contains(&cp) +} + +// ``` +// UnicodePropertyNameCharacter :: +// AsciiLetter +// _ +// ``` +pub fn is_unicode_property_name_character(cp: u32) -> bool { + char::from_u32(cp).map_or(false, |ch| ch.is_ascii_alphabetic() || ch == '_') +} + +// ``` +// UnicodePropertyValueCharacter :: +// UnicodePropertyNameCharacter +// DecimalDigit +// ``` +pub fn is_unicode_property_value_character(cp: u32) -> bool { + char::from_u32(cp).map_or(false, |ch| ch.is_ascii_alphanumeric() || ch == '_') +} + +pub fn is_unicode_id_start(cp: u32) -> bool { + char::from_u32(cp).map_or(false, unicode_id_start::is_id_start) +} + +pub fn is_unicode_id_continue(cp: u32) -> bool { + char::from_u32(cp).map_or(false, unicode_id_start::is_id_continue) +} + +pub fn is_identifier_start_char(cp: u32) -> bool { + char::from_u32(cp) + .map_or(false, |ch| 
unicode_id_start::is_id_start(ch) || ch == '$' || ch == '_') +} + +pub fn is_identifier_part_char(cp: u32) -> bool { + char::from_u32(cp).map_or(false, |ch| unicode_id_start::is_id_continue(ch) || ch == '$') +} + +pub fn is_lead_surrogate(cp: u32) -> bool { + (0xd800..=0xdbff).contains(&cp) +} + +pub fn is_trail_surrogate(cp: u32) -> bool { + (0xdc00..=0xdfff).contains(&cp) +} + +pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 { + (lead - 0xd800) * 0x400 + trail - 0xdc00 + 0x10000 +} + +pub fn map_control_escape(cp: u32) -> Option { + match char::from_u32(cp) { + Some('f') => Some(0x0c), + Some('n') => Some(0x0a), + Some('r') => Some(0x0d), + Some('t') => Some(0x09), + Some('v') => Some(0x0b), + _ => None, + } +} + +pub fn map_c_ascii_letter(cp: u32) -> Option { + char::from_u32(cp).filter(char::is_ascii_alphabetic).map(|_| cp % 0x20) +} + +pub fn map_hex_digit(cp: u32) -> Option { + char::from_u32(cp).filter(char::is_ascii_hexdigit).and_then(|c| c.to_digit(16)) +} diff --git a/crates/oxc_regexp_parser/src/body_parser/unicode_property.rs b/crates/oxc_regexp_parser/src/body_parser/unicode_property.rs new file mode 100644 index 0000000000000..65c766580b328 --- /dev/null +++ b/crates/oxc_regexp_parser/src/body_parser/unicode_property.rs @@ -0,0 +1,354 @@ +use phf::{phf_set, Set}; + +// https://tc39.es/ecma262/2024/multipage/text-processing.html#table-nonbinary-unicode-properties +pub fn is_valid_unicode_property(name: &str, value: &str) -> bool { + if matches!(name, "General_Category" | "gc") { + return GC_PROPERTY_VALUES.contains(value); + } + if matches!(name, "Script" | "sc") { + return SC_PROPERTY_VALUES.contains(value); + } + if matches!(name, "Script_Extensions" | "scx") { + return SC_PROPERTY_VALUES.contains(value) || SCX_PROPERTY_VALUES.contains(value); + } + false +} + +pub fn is_valid_lone_unicode_property(name_or_value: &str) -> bool { + BINARY_UNICODE_PROPERTIES.contains(name_or_value) +} +/// This should be used with `UnicodeSetsMode` +pub fn 
is_valid_lone_unicode_property_of_strings(name_or_value: &str) -> bool { + BINARY_UNICODE_PROPERTIES_OF_STRINGS.contains(name_or_value) +} + +// spellchecker:off +// https://unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt +static GC_PROPERTY_VALUES: Set<&'static str> = phf_set! { + "C", "Other", + "Cc", "Control", + "Cf", "Format", + "Cn", "Unassigned", + "Co", "Private_Use", + "Cs", "Surrogate", + "L", "Letter", + "LC", "Cased_Letter", + "Ll", "Lowercase_Letter", + "Lm", "Modifier_Letter", + "Lo", "Other_Letter", + "Lt", "Titlecase_Letter", + "Lu", "Uppercase_Letter", + "M", "Mark", + "Mc", "Spacing_Mark", + "Me", "Enclosing_Mark", + "Mn", "Nonspacing_Mark", + "N", "Number", + "Nd", "Decimal_Number", + "Nl", "Letter_Number", + "No", "Other_Number", + "P", "Punctuation", + "Pc", "Connector_Punctuation", + "Pd", "Dash_Punctuation", + "Pe", "Close_Punctuation", + "Pf", "Final_Punctuation", + "Pi", "Initial_Punctuation", + "Po", "Other_Punctuation", + "Ps", "Open_Punctuation", + "S", "Symbol", + "Sc", "Currency_Symbol", + "Sk", "Modifier_Symbol", + "Sm", "Math_Symbol", + "So", "Other_Symbol", + "Z", "Separator", + "Zl", "Line_Separator", + "Zp", "Paragraph_Separator", + "Zs", "Space_Separator" +}; + +static SC_PROPERTY_VALUES: Set<&'static str> = phf_set! 
{ + "Adlm", "Adlam", + "Aghb", "Caucasian_Albanian", + "Ahom", + "Arab", "Arabic", + "Armi", "Imperial_Aramaic", + "Armn", "Armenian", + "Avst", "Avestan", + "Bali", "Balinese", + "Bamu", "Bamum", + "Bass", "Bassa_Vah", + "Batk", "Batak", + "Beng", "Bengali", + "Bhks", "Bhaiksuki", + "Bopo", "Bopomofo", + "Brah", "Brahmi", + "Brai", "Braille", + "Bugi", "Buginese", + "Buhd", "Buhid", + "Cakm", "Chakma", + "Cans", "Canadian_Aboriginal", + "Cari", "Carian", + "Cham", + "Cher", "Cherokee", + "Chrs", "Chorasmian", + "Copt", "Coptic", + "Cpmn", "Cypro_Minoan", + "Cprt", "Cypriot", + "Cyrl", "Cyrillic", + "Deva", "Devanagari", + "Diak", "Dives_Akuru", + "Dogr", "Dogra", + "Dsrt", "Deseret", + "Dupl", "Duployan", + "Egyp", "Egyptian_Hieroglyphs", + "Elba", "Elbasan", + "Elym", "Elymaic", + "Ethi", "Ethiopic", + "Geor", "Georgian", + "Glag", "Glagolitic", + "Gong", "Gunjala_Gondi", + "Gonm", "Masaram_Gondi", + "Goth", "Gothic", + "Gran", "Grantha", + "Grek", "Greek", + "Gujr", "Gujarati", + "Guru", "Gurmukhi", + "Hang", "Hangul", + "Hani", "Han", + "Hano", "Hanunoo", + "Hatr", "Hatran", + "Hebr", "Hebrew", + "Hira", "Hiragana", + "Hluw", "Anatolian_Hieroglyphs", + "Hmng", "Pahawh_Hmong", + "Hmnp", "Nyiakeng_Puachue_Hmong", + "Hrkt", "Katakana_Or_Hiragana", + "Hung", "Old_Hungarian", + "Ital", "Old_Italic", + "Java", "Javanese", + "Kali", "Kayah_Li", + "Kana", "Katakana", + "Kawi", + "Khar", "Kharoshthi", + "Khmr", "Khmer", + "Khoj", "Khojki", + "Kits", "Khitan_Small_Script", + "Knda", "Kannada", + "Kthi", "Kaithi", + "Lana", "Tai_Tham", + "Laoo", "Lao", + "Latn", "Latin", + "Lepc", "Lepcha", + "Limb", "Limbu", + "Lina", "Linear_A", + "Linb", "Linear_B", + "Lisu", + "Lyci", "Lycian", + "Lydi", "Lydian", + "Mahj", "Mahajani", + "Maka", "Makasar", + "Mand", "Mandaic", + "Mani", "Manichaean", + "Marc", "Marchen", + "Medf", "Medefaidrin", + "Mend", "Mende_Kikakui", + "Merc", "Meroitic_Cursive", + "Mero", "Meroitic_Hieroglyphs", + "Mlym", "Malayalam", + "Modi", + "Mong", 
"Mongolian", + "Mroo", "Mro", + "Mtei", "Meetei_Mayek", + "Mult", "Multani", + "Mymr", "Myanmar", + "Nagm", "Nag_Mundari", + "Nand", "Nandinagari", + "Narb", "Old_North_Arabian", + "Nbat", "Nabataean", + "Newa", + "Nkoo", "Nko", + "Nshu", "Nushu", + "Ogam", "Ogham", + "Olck", "Ol_Chiki", + "Orkh", "Old_Turkic", + "Orya", "Oriya", + "Osge", "Osage", + "Osma", "Osmanya", + "Ougr", "Old_Uyghur", + "Palm", "Palmyrene", + "Pauc", "Pau_Cin_Hau", + "Perm", "Old_Permic", + "Phag", "Phags_Pa", + "Phli", "Inscriptional_Pahlavi", + "Phlp", "Psalter_Pahlavi", + "Phnx", "Phoenician", + "Plrd", "Miao", + "Prti", "Inscriptional_Parthian", + "Rjng", "Rejang", + "Rohg", "Hanifi_Rohingya", + "Runr", "Runic", + "Samr", "Samaritan", + "Sarb", "Old_South_Arabian", + "Saur", "Saurashtra", + "Sgnw", "SignWriting", + "Shaw", "Shavian", + "Shrd", "Sharada", + "Sidd", "Siddham", + "Sind", "Khudawadi", + "Sinh", "Sinhala", + "Sogd", "Sogdian", + "Sogo", "Old_Sogdian", + "Sora", "Sora_Sompeng", + "Soyo", "Soyombo", + "Sund", "Sundanese", + "Sylo", "Syloti_Nagri", + "Syrc", "Syriac", + "Tagb", "Tagbanwa", + "Takr", "Takri", + "Tale", "Tai_Le", + "Talu", "New_Tai_Lue", + "Taml", "Tamil", + "Tang", "Tangut", + "Tavt", "Tai_Viet", + "Telu", "Telugu", + "Tfng", "Tifinagh", + "Tglg", "Tagalog", + "Thaa", "Thaana", + "Thai", + "Tibt", "Tibetan", + "Tirh", "Tirhuta", + "Tnsa", "Tangsa", + "Toto", + "Ugar", "Ugaritic", + "Vaii", "Vai", + "Vith", "Vithkuqi", + "Wara", "Warang_Citi", + "Wcho", "Wancho", + "Xpeo", "Old_Persian", + "Xsux", "Cuneiform", + "Yezi", "Yezidi", + "Yiii", "Yi", + "Zanb", "Zanabazar_Square", + "Zinh", "Inherited", + "Zyyy", "Common", + "Zzzz", "Unknown" +}; + +static SCX_PROPERTY_VALUES: Set<&'static str> = phf_set! { + // Empty +}; + +// Table 66: Binary Unicode property aliases +// https://tc39.es/ecma262/2024/multipage/text-processing.html#table-binary-unicode-properties +static BINARY_UNICODE_PROPERTIES: Set<&'static str> = phf_set! 
{ + "ASCII", + "ASCII_Hex_Digit", + "AHex", + "Alphabetic", + "Alpha", + "Any", + "Assigned", + "Bidi_Control", + "Bidi_C", + "Bidi_Mirrored", + "Bidi_M", + "Case_Ignorable", + "CI", + "Cased", + "Changes_When_Casefolded", + "CWCF", + "Changes_When_Casemapped", + "CWCM", + "Changes_When_Lowercased", + "CWL", + "Changes_When_NFKC_Casefolded", + "CWKCF", + "Changes_When_Titlecased", + "CWT", + "Changes_When_Uppercased", + "CWU", + "Dash", + "Default_Ignorable_Code_Point", + "DI", + "Deprecated", + "Dep", + "Diacritic", + "Dia", + "Emoji", + "Emoji_Component", + "EComp", + "Emoji_Modifier", + "EMod", + "Emoji_Modifier_Base", + "EBase", + "Emoji_Presentation", + "EPres", + "Extended_Pictographic", + "ExtPict", + "Extender", + "Ext", + "Grapheme_Base", + "Gr_Base", + "Grapheme_Extend", + "Gr_Ext", + "Hex_Digit", + "Hex", + "IDS_Binary_Operator", + "IDSB", + "IDS_Trinary_Operator", + "IDST", + "ID_Continue", + "IDC", + "ID_Start", + "IDS", + "Ideographic", + "Ideo", + "Join_Control", + "Join_C", + "Logical_Order_Exception", + "LOE", + "Lowercase", + "Lower", + "Math", + "Noncharacter_Code_Point", + "NChar", + "Pattern_Syntax", + "Pat_Syn", + "Pattern_White_Space", + "Pat_WS", + "Quotation_Mark", + "QMark", + "Radical", + "Regional_Indicator", + "RI", + "Sentence_Terminal", + "STerm", + "Soft_Dotted", + "SD", + "Terminal_Punctuation", + "Term", + "Unified_Ideograph", + "UIdeo", + "Uppercase", + "Upper", + "Variation_Selector", + "VS", + "White_Space", + "space", + "XID_Continue", + "XIDC", + "XID_Start", + "XIDS", +}; + +// Table 67: Binary Unicode properties of strings +// https://tc39.es/ecma262/2024/multipage/text-processing.html#table-binary-unicode-properties-of-strings +static BINARY_UNICODE_PROPERTIES_OF_STRINGS: Set<&'static str> = phf_set! 
{ + "Basic_Emoji", + "Emoji_Keycap_Sequence", + "RGI_Emoji_Modifier_Sequence", + "RGI_Emoji_Flag_Sequence", + "RGI_Emoji_Tag_Sequence", + "RGI_Emoji_ZWJ_Sequence", + "RGI_Emoji", +}; +// spellchecker:on diff --git a/crates/oxc_regexp_parser/src/flag_parser.rs b/crates/oxc_regexp_parser/src/flag_parser.rs new file mode 100644 index 0000000000000..90f8b0cffdad4 --- /dev/null +++ b/crates/oxc_regexp_parser/src/flag_parser.rs @@ -0,0 +1,69 @@ +use oxc_allocator::Allocator; +use oxc_diagnostics::{OxcDiagnostic, Result}; +use rustc_hash::FxHashSet; + +use crate::{ast, options::ParserOptions, span::SpanFactory}; + +pub struct FlagsParser<'a> { + source_text: &'a str, + // options: ParserOptions, + span_factory: SpanFactory, +} + +impl<'a> FlagsParser<'a> { + pub fn new(_allocator: &'a Allocator, source_text: &'a str, options: ParserOptions) -> Self { + Self { + source_text, + // options, + span_factory: SpanFactory::new(options.span_offset), + } + } + + pub fn parse(&mut self) -> Result { + let mut existing_flags = FxHashSet::default(); + + let mut global = false; + let mut ignore_case = false; + let mut multiline = false; + let mut unicode = false; + let mut sticky = false; + let mut dot_all = false; + let mut has_indices = false; + let mut unicode_sets = false; + + for c in self.source_text.chars() { + if !existing_flags.insert(c) { + return Err(OxcDiagnostic::error(format!("Duplicated flag `{c}`"))); + } + + match c { + 'g' => global = true, + 'i' => ignore_case = true, + 'm' => multiline = true, + 'u' => unicode = true, + 'y' => sticky = true, + 's' => dot_all = true, + 'd' => has_indices = true, + 'v' => unicode_sets = true, + _ => return Err(OxcDiagnostic::error(format!("Invalid flag `{c}`"))), + } + } + + // This should be a `SyntaxError` + if unicode && unicode_sets { + return Err(OxcDiagnostic::error("Invalid regular expression flags")); + } + + Ok(ast::Flags { + span: self.span_factory.create(0, self.source_text.len()), + global, + ignore_case, + multiline, + 
unicode, + sticky, + dot_all, + has_indices, + unicode_sets, + }) + } +} diff --git a/crates/oxc_regexp_parser/src/lib.rs b/crates/oxc_regexp_parser/src/lib.rs new file mode 100644 index 0000000000000..ae2f1c0a58bc0 --- /dev/null +++ b/crates/oxc_regexp_parser/src/lib.rs @@ -0,0 +1,13 @@ +#![allow(clippy::missing_errors_doc)] + +pub mod ast; +mod body_parser; +mod flag_parser; +mod literal_parser; +mod options; +mod span; + +pub use crate::body_parser::PatternParser; +pub use crate::flag_parser::FlagsParser; +pub use crate::literal_parser::Parser; +pub use crate::options::ParserOptions; diff --git a/crates/oxc_regexp_parser/src/literal_parser.rs b/crates/oxc_regexp_parser/src/literal_parser.rs new file mode 100644 index 0000000000000..64d694aee05ad --- /dev/null +++ b/crates/oxc_regexp_parser/src/literal_parser.rs @@ -0,0 +1,161 @@ +use oxc_allocator::Allocator; +use oxc_diagnostics::{OxcDiagnostic, Result}; + +use crate::{ + ast, body_parser::PatternParser, flag_parser::FlagsParser, options::ParserOptions, + span::SpanFactory, +}; + +/// LiteralParser +pub struct Parser<'a> { + allocator: &'a Allocator, + source_text: &'a str, + options: ParserOptions, + span_factory: SpanFactory, +} + +impl<'a> Parser<'a> { + pub fn new(allocator: &'a Allocator, source_text: &'a str, options: ParserOptions) -> Self { + Self { + allocator, + source_text, + options, + span_factory: SpanFactory::new(options.span_offset), + } + } + + pub fn parse(self) -> Result> { + // Precheck if the source text is a valid regular expression literal + // If valid, parse the pattern and flags with returned span offsets + let (body_start_offset, body_end_offset, flag_start_offset) = + parse_reg_exp_literal(self.source_text)?; + + // Parse flags first to know if unicode mode is enabled or not + let flags = FlagsParser::new( + self.allocator, + &self.source_text[flag_start_offset..], + #[allow(clippy::cast_possible_truncation)] + self.options.with_span_offset(self.options.span_offset + 
flag_start_offset as u32), + ) + .parse()?; + + // Then parse the pattern with the flags + let pattern_options = match (flags.unicode, flags.unicode_sets) { + (true, false) => self.options.with_unicode_mode(), + (_, true) => self.options.with_unicode_sets_mode(), + _ => self.options, + }; + + let pattern = PatternParser::new( + self.allocator, + &self.source_text[body_start_offset..body_end_offset], + #[allow(clippy::cast_possible_truncation)] + pattern_options.with_span_offset(self.options.span_offset + body_start_offset as u32), + ) + .parse()?; + + Ok(ast::RegExpLiteral { + span: self.span_factory.create(0, self.source_text.len()), + pattern, + flags, + }) + } +} + +/// Check passed source text is a valid regular expression literal. +/// ``` +/// / RegularExpressionBody / RegularExpressionFlags +/// ``` +/// Returns `(body_start_offset, body_end_offset, flag_start_offset)`. +fn parse_reg_exp_literal(source_text: &str) -> Result<(usize, usize, usize)> { + let mut offset = 0; + let mut chars = source_text.chars().peekable(); + + let Some('/') = chars.next() else { + return Err(OxcDiagnostic::error("Unexpected character")); + }; + offset += 1; // '/' + + let body_start = offset; + + let mut in_escape = false; + let mut in_character_class = false; + loop { + match chars.peek() { + // Line terminators are not allowed + Some('\u{a}' | '\u{d}' | '\u{2028}' | '\u{2029}') | None => { + let kind = + if in_character_class { "character class" } else { "regular expression" }; + return Err(OxcDiagnostic::error(format!("Unterminated {kind}"))); + } + Some(&ch) => { + if in_escape { + in_escape = false; + } else if ch == '\\' { + in_escape = true; + } else if ch == '[' { + in_character_class = true; + } else if ch == ']' { + in_character_class = false; + } else if ch == '/' && !in_character_class + // `*` is not allowed as `RegularExpressionFirstChar` + || offset == body_start && ch == '*' + { + break; + } + + offset += ch.len_utf8(); + } + } + + chars.next(); + } + + let 
Some('/') = chars.next() else { + return Err(OxcDiagnostic::error("Unexpected character")); + }; + let body_end = offset; + + if body_end == body_start { + return Err(OxcDiagnostic::error("Empty")); + } + + Ok((body_start, body_end, body_end + 1)) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn parse_valid_reg_exp_literal() { + for literal_text in [ + "/(?:)/", + "/abc/", + "/abcd/igsmv", + r"/\w+/u", + r"/foo\/bar|baz/i", + "/[a-z]/", + "/正規表現/u", + "/あっち👈🏻/i", + "/👈🏻こっち/u", + ] { + let (body_start_offset, body_end_offset, flag_start_offset) = + parse_reg_exp_literal(literal_text) + .unwrap_or_else(|_| panic!("{literal_text} should be parsed")); + + let body_text = &literal_text[body_start_offset..body_end_offset]; + let flag_text = &literal_text[flag_start_offset..]; + assert_eq!(format!("/{body_text}/{flag_text}",), literal_text); + } + } + + #[test] + fn parse_invalid_reg_exp_literal() { + for literal_text in + ["", "foo", ":(", "a\nb", "/", "/x", "/y\nz/", "/1[\n]/", "//", "///", "/*abc/", "/\\/"] + { + assert!(parse_reg_exp_literal(literal_text).is_err()); + } + } +} diff --git a/crates/oxc_regexp_parser/src/options.rs b/crates/oxc_regexp_parser/src/options.rs new file mode 100644 index 0000000000000..02280ed3378f1 --- /dev/null +++ b/crates/oxc_regexp_parser/src/options.rs @@ -0,0 +1,25 @@ +#[derive(Clone, Copy, Debug, Default)] +pub struct ParserOptions { + /// Used to adjust Span positions to fit the global source code. + pub span_offset: u32, + /// Unicode mode enabled or not. + pub unicode_mode: bool, + /// Extended Unicode mode enabled or not. 
+ pub unicode_sets_mode: bool, +} + +impl ParserOptions { + #[must_use] + pub fn with_span_offset(self, span_offset: u32) -> ParserOptions { + ParserOptions { span_offset, ..self } + } + + #[must_use] + pub fn with_unicode_mode(self) -> ParserOptions { + ParserOptions { unicode_mode: true, ..self } + } + #[must_use] + pub fn with_unicode_sets_mode(self) -> ParserOptions { + ParserOptions { unicode_mode: true, unicode_sets_mode: true, ..self } + } +} diff --git a/crates/oxc_regexp_parser/src/span.rs b/crates/oxc_regexp_parser/src/span.rs new file mode 100644 index 0000000000000..ed57d43c54159 --- /dev/null +++ b/crates/oxc_regexp_parser/src/span.rs @@ -0,0 +1,16 @@ +use oxc_span::Span; + +pub struct SpanFactory { + span_offset: u32, +} + +impl SpanFactory { + pub fn new(span_offset: u32) -> Self { + Self { span_offset } + } + + #[allow(clippy::cast_possible_truncation)] + pub fn create(&self, start: usize, end: usize) -> Span { + Span::new((start as u32) + self.span_offset, (end as u32) + self.span_offset) + } +}