From 860a4e679d11b666b6d4229c47db9e21c0e23f60 Mon Sep 17 00:00:00 2001 From: Ubugeeei Date: Fri, 17 Nov 2023 10:28:06 +0900 Subject: [PATCH 1/5] init js_regex --- Cargo.lock | 10 + crates/oxc_js_regex/Cargo.toml | 21 ++ crates/oxc_js_regex/README.md | 5 + crates/oxc_js_regex/src/ast.rs | 387 +++++++++++++++++++++++++ crates/oxc_js_regex/src/lexer/mod.rs | 1 + crates/oxc_js_regex/src/lexer/token.rs | 1 + crates/oxc_js_regex/src/lib.rs | 5 + crates/oxc_js_regex/src/parser.rs | 0 crates/oxc_js_regex/src/validator.rs | 0 crates/oxc_js_regex/src/visitor.rs | 0 10 files changed, 430 insertions(+) create mode 100644 crates/oxc_js_regex/Cargo.toml create mode 100644 crates/oxc_js_regex/README.md create mode 100644 crates/oxc_js_regex/src/ast.rs create mode 100644 crates/oxc_js_regex/src/lexer/mod.rs create mode 100644 crates/oxc_js_regex/src/lexer/token.rs create mode 100644 crates/oxc_js_regex/src/lib.rs create mode 100644 crates/oxc_js_regex/src/parser.rs create mode 100644 crates/oxc_js_regex/src/validator.rs create mode 100644 crates/oxc_js_regex/src/visitor.rs diff --git a/Cargo.lock b/Cargo.lock index c07988158702d..6431000ee2c05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1624,6 +1624,16 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "oxc_js_regex" +version = "0.0.0" +dependencies = [ + "oxc_allocator", + "oxc_index", + "oxc_span", + "oxc_syntax", +] + [[package]] name = "oxc_linter" version = "0.0.0" diff --git a/crates/oxc_js_regex/Cargo.toml b/crates/oxc_js_regex/Cargo.toml new file mode 100644 index 0000000000000..895809c4f7b46 --- /dev/null +++ b/crates/oxc_js_regex/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "oxc_js_regex" +version = "0.0.0" +publish = false +authors.workspace = true +categories.workspace = true +description.workspace = true +edition.workspace = true +homepage.workspace = true +keywords.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true + +# See more keys and their 
definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +oxc_allocator = { workspace = true } +oxc_span = { workspace = true } +oxc_syntax = { workspace = true } +oxc_index = { workspace = true } diff --git a/crates/oxc_js_regex/README.md b/crates/oxc_js_regex/README.md new file mode 100644 index 0000000000000..f1e6b2fc5370f --- /dev/null +++ b/crates/oxc_js_regex/README.md @@ -0,0 +1,5 @@ +# oxc_js_regex + +⚠️ Work in progress. Do not use yet. + +see: https://github.com/oxc-project/oxc/issues/1164 \ No newline at end of file diff --git a/crates/oxc_js_regex/src/ast.rs b/crates/oxc_js_regex/src/ast.rs new file mode 100644 index 0000000000000..65cf93b30c1b5 --- /dev/null +++ b/crates/oxc_js_regex/src/ast.rs @@ -0,0 +1,387 @@ +//! [`@eslint-community/regexpp`](https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/ast.ts) + +use oxc_allocator::{Box, Vec}; +use oxc_span::{Atom, Span}; + +/// The type which includes all nodes. +#[derive(Debug)] +pub enum Node<'a> { + Branch(Box<'a, Branch<'a>>), + Leaf(Box<'a, Leaf<'a>>), +} + +/// The type which includes all branch nodes. +#[derive(Debug)] +pub enum Branch<'a> { + Alternative(Box<'a, Alternative<'a>>), + CapturingGroup(Box<'a, CapturingGroup<'a>>), + CharacterClass(Box<'a, CharacterClass<'a>>), + CharacterClassRange(Box<'a, CharacterClassRange>), + ClassIntersection(Box<'a, ClassIntersection<'a>>), + ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>), + ClassSubtraction(Box<'a, ClassSubtraction<'a>>), + ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), + Group(Box<'a, Group<'a>>), + LookaroundAssertion(Box<'a, LookaroundAssertion<'a>>), + Pattern(Box<'a, Pattern<'a>>), + Quantifier(Box<'a, Quantifier<'a>>), + RegExpLiteral(Box<'a, RegExpLiteral<'a>>), + StringAlternative(Box<'a, StringAlternative<'a>>), +} + +/// The type which includes all leaf nodes. 
+#[derive(Debug)] +pub enum Leaf<'a> { + Backreference(Box<'a, Backreference<'a>>), + BoundaryAssertion(Box<'a, BoundaryAssertion<'a>>), + Character(Box<'a, Character>), + CharacterSet(Box<'a, CharacterSet<'a>>), + Flags(Box<'a, Flags>), +} + +/// The type which includes all atom nodes. +#[derive(Debug)] +pub enum Element<'a> { + Assertion(Box<'a, Assertion<'a>>), + QuantifiableElement(Box<'a, QuantifiableElement<'a>>), + Quantifier(Box<'a, Quantifier<'a>>), +} + +/// The type which includes all atom nodes that Quantifier node can have as children. +#[derive(Debug)] +pub enum QuantifiableElement<'a> { + Backreference(Box<'a, Backreference<'a>>), + CapturingGroup(Box<'a, CapturingGroup<'a>>), + Character(Box<'a, Character>), + CharacterClass(Box<'a, CharacterClass<'a>>), + CharacterSet(Box<'a, CharacterSet<'a>>), + ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), + Group(Box<'a, Group<'a>>), + LookaheadAssertion(Box<'a, LookaheadAssertion<'a>>), +} + +/// The type which includes all character class atom nodes. 
+#[derive(Debug)] +pub enum CharacterClassElement<'a> { + ClassRangesCharacterClassElement(Box<'a, ClassRangesCharacterClassElement<'a>>), + UnicodeSetsCharacterClassElement(Box<'a, UnicodeSetsCharacterClassElement<'a>>), +} +#[derive(Debug)] +pub enum ClassRangesCharacterClassElement<'a> { + Character(Box<'a, Character>), + CharacterClassRange(Box<'a, CharacterClassRange>), + CharacterUnicodePropertyCharacterSet(Box<'a, CharacterUnicodePropertyCharacterSet>), + EscapeCharacterSet(Box<'a, EscapeCharacterSet>), +} +#[derive(Debug)] +pub enum UnicodeSetsCharacterClassElement<'a> { + Character(Box<'a, Character>), + CharacterClassRange(Box<'a, CharacterClassRange>), + ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>), + EscapeCharacterSet(Box<'a, EscapeCharacterSet>), + ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), + UnicodePropertyCharacterSet(Box<'a, UnicodePropertyCharacterSet<'a>>), + UnicodeSetsCharacterClass(Box<'a, UnicodeSetsCharacterClass<'a>>), +} + +/// The root node. +#[derive(Debug)] +pub struct RegExpLiteral<'a> { + pub span: Span, + pub pattern: Pattern<'a>, + pub flags: Flags, +} + +/// The pattern. +#[derive(Debug)] +pub struct Pattern<'a> { + pub span: Span, + pub alternatives: Vec<'a, Alternative<'a>>, +} + +/// The alternative. +/// E.g. `a|b` +#[derive(Debug)] +pub struct Alternative<'a> { + pub span: Span, + pub elements: Vec<'a, Element<'a>>, +} + +/// The non-capturing group. +/// E.g. `(?:ab)` +#[derive(Debug)] +pub struct Group<'a> { + pub span: Span, + pub alternatives: Vec<'a, Alternative<'a>>, +} + +/// The capturing group. +/// E.g. `(ab)`, `(?<name>ab)` +#[derive(Debug)] +pub struct CapturingGroup<'a> { + pub span: Span, + pub name: Option<Atom>, + pub alternatives: Vec<'a, Alternative<'a>>, + pub references: Vec<'a, Backreference<'a>>, +} + +/// The lookaround assertion.
+#[derive(Debug)] +pub enum LookaroundAssertion<'a> { + LookaheadAssertion(Box<'a, LookaheadAssertion<'a>>), + LookbehindAssertion(Box<'a, LookbehindAssertion<'a>>), +} + +/// The lookahead assertion. +/// E.g. `(?=ab)`, `(?!ab)` +#[derive(Debug)] +pub struct LookaheadAssertion<'a> { + pub span: Span, + pub negate: bool, + pub alternatives: Vec<'a, Alternative<'a>>, +} + +/// The lookbehind assertion. +/// E.g. `(?<=ab)`, `(?<!ab)` +#[derive(Debug)] +pub struct LookbehindAssertion<'a> { + pub span: Span, + pub negate: bool, + pub alternatives: Vec<'a, Alternative<'a>>, +} + +/// The quantifier. +/// E.g. `a?`, `a*`, `a+`, `a{1,2}`, `a??`, `a*?`, `a+?`, `a{1,2}?` +#[derive(Debug)] +pub struct Quantifier<'a> { + pub span: Span, + pub min: f64, + pub max: f64, // can be f64::INFINITY + pub greedy: bool, + pub element: QuantifiableElement<'a>, +} + +/// The character class. +/// E.g. `[ab]`, `[^ab]` +#[derive(Debug)] +pub enum CharacterClass<'a> { + ClassRangesCharacterClass(Box<'a, ClassRangesCharacterClass<'a>>), + UnicodeSetsCharacterClass(Box<'a, UnicodeSetsCharacterClass<'a>>), +} + +/// The character class used in legacy (neither `u` nor `v` flag) and Unicode mode (`u` flag). +/// This character class is guaranteed to **not** contain strings. +/// In Unicode sets mode (`v` flag), [`UnicodeSetsCharacterClass`] is used. +#[derive(Debug)] +pub struct ClassRangesCharacterClass<'a> { + pub span: Span, + pub unicode_sets: bool, + pub elements: Vec<'a, ClassRangesCharacterClassElement<'a>>, +} + +/// The character class used in Unicode sets mode (`v` flag). +/// This character class may contain strings. +#[derive(Debug)] +pub struct UnicodeSetsCharacterClass<'a> { + pub span: Span, + pub elements: Vec<'a, UnicodeSetsCharacterClassElement<'a>>, +} + +/// The character class range. +/// E.g. `[a-b]` +#[derive(Debug)] +pub struct CharacterClassRange { + pub span: Span, + pub min: Character, + pub max: Character, +} + +/// The assertion.
+#[derive(Debug)] +pub enum Assertion<'a> { + BoundaryAssertion(Box<'a, BoundaryAssertion<'a>>), + LookaroundAssertion(Box<'a, LookaroundAssertion<'a>>), +} + +/// The boundary assertion. +#[derive(Debug)] +pub enum BoundaryAssertion<'a> { + EdgeAssertion(Box<'a, EdgeAssertion>), + WordBoundaryAssertion(Box<'a, WordBoundaryAssertion>), +} + +/// The edge boundary assertion. +/// E.g. `^`, `$` +#[derive(Debug)] +pub struct EdgeAssertion { + pub span: Span, + pub kind: EdgeAssertionKind, +} + +#[derive(Debug)] +pub enum EdgeAssertionKind { + Start, + End, +} + +/// The word boundary assertion. +/// E.g. `\b`, `\B` +#[derive(Debug)] +pub struct WordBoundaryAssertion { + pub span: Span, + pub negate: bool, +} + +/// The character set. +#[derive(Debug)] +pub enum CharacterSet<'a> { + AnyCharacterSet, + EscapeCharacterSet(Box<'a, EscapeCharacterSet>), + UnicodePropertyCharacterSet(Box<'a, UnicodePropertyCharacterSet<'a>>), +} + +/// The character class escape. +/// E.g. `\d`, `\s`, `\w`, `\D`, `\S`, `\W` +#[derive(Debug)] +pub struct EscapeCharacterSet { + pub span: Span, + pub kind: EscapeCharacterSetKind, + pub negate: bool, +} + +#[derive(Debug)] +pub enum EscapeCharacterSetKind { + Digit, + Space, + Word, +} + +/// The Unicode property escape. +/// E.g. `\p{ASCII}`, `\P{ASCII}`, `\p{Script=Hiragana}` +#[derive(Debug)] +pub enum UnicodePropertyCharacterSet<'a> { + CharacterUnicodePropertyCharacterSet(Box<'a, CharacterUnicodePropertyCharacterSet>), + StringsUnicodePropertyCharacterSet(Box<'a, StringsUnicodePropertyCharacterSet>), +} + +#[derive(Debug)] +pub struct CharacterUnicodePropertyCharacterSet { + pub span: Span, + pub key: Atom, + pub value: Option<Atom>, + pub negate: bool, +} + +/// StringsUnicodePropertyCharacterSet is a Unicode property escape with a property of strings. +#[derive(Debug)] +pub struct StringsUnicodePropertyCharacterSet { + pub span: Span, + pub key: Atom, +} + +/// The expression character class. +/// E.g.
`[a--b]`, `[a&&b]`,`[^a--b]`, `[^a&&b]` +#[derive(Debug)] +pub struct ExpressionCharacterClass<'a> { + pub span: Span, + pub negate: bool, + pub expression: ExpressionCharacterClassExpr<'a>, +} + +#[derive(Debug)] +pub enum ExpressionCharacterClassExpr<'a> { + ClassIntersection(Box<'a, ClassIntersection<'a>>), + ClassSubtraction(Box<'a, ClassSubtraction<'a>>), +} + +#[derive(Debug)] +pub enum ClassSetOperand<'a> { + Character(Box<'a, Character>), + ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>), + EscapeCharacterSet(Box<'a, EscapeCharacterSet>), + ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), + UnicodePropertyCharacterSet(Box<'a, UnicodePropertyCharacterSet<'a>>), + UnicodeSetsCharacterClass(Box<'a, UnicodeSetsCharacterClass<'a>>), +} + +/// The character class intersection. +/// E.g. `a&&b` +#[derive(Debug)] +pub struct ClassIntersection<'a> { + pub span: Span, + pub left: ClassIntersectionLeft<'a>, + pub right: ClassSetOperand<'a>, +} + +#[derive(Debug)] +pub enum ClassIntersectionLeft<'a> { + ClassIntersection(Box<'a, ClassIntersection<'a>>), + ClassSetOperand(Box<'a, ClassSetOperand<'a>>), +} + +/// The character class subtraction. +/// E.g. `a--b` +#[derive(Debug)] +pub struct ClassSubtraction<'a> { + pub span: Span, + pub left: ClassSubtractionLeft<'a>, + pub right: ClassSetOperand<'a>, +} + +#[derive(Debug)] +pub enum ClassSubtractionLeft<'a> { + ClassSetOperand(Box<'a, ClassSetOperand<'a>>), + ClassSubtraction(Box<'a, ClassSubtraction<'a>>), +} + +/// The character class string disjunction. +/// E.g. `\q{a|b}` +#[derive(Debug)] +pub struct ClassStringDisjunction<'a> { + pub span: Span, + pub alternatives: Vec<'a, StringAlternative<'a>>, +} + +/// StringAlternative is only used for `\q{alt}`({@link ClassStringDisjunction}). +#[derive(Debug)] +pub struct StringAlternative<'a> { + pub span: Span, + pub elements: Vec<'a, Character>, +} + +/// This includes escape sequences which mean a character. +/// E.g. 
`a`, `あ`, `✿`, `\x65`, `\u0065`, `\u{65}`, `\/` +#[derive(Debug)] +pub struct Character { + pub span: Span, + pub value: u16, // UTF-16 code point +} + +#[derive(Debug)] +pub enum BackreferenceRef { + Number(i32), // FIXME: + Atom(Atom), +} + +/// The backreference. +/// E.g. `\1`, `\k<name>` +#[derive(Debug)] +pub struct Backreference<'a> { + pub span: Span, + pub reference: BackreferenceRef, + pub resolved: CapturingGroup<'a>, +} + +/// The flags. +#[derive(Debug)] +pub struct Flags { + pub span: Span, + pub dot_all: bool, + pub global: bool, + pub has_indices: bool, + pub ignore_case: bool, + pub multiline: bool, + pub sticky: bool, + pub unicode: bool, + pub unicode_sets: bool, +} diff --git a/crates/oxc_js_regex/src/lexer/mod.rs b/crates/oxc_js_regex/src/lexer/mod.rs new file mode 100644 index 0000000000000..40d3ff585686a --- /dev/null +++ b/crates/oxc_js_regex/src/lexer/mod.rs @@ -0,0 +1 @@ +mod token; diff --git a/crates/oxc_js_regex/src/lexer/token.rs b/crates/oxc_js_regex/src/lexer/token.rs new file mode 100644 index 0000000000000..8b137891791fe --- /dev/null +++ b/crates/oxc_js_regex/src/lexer/token.rs @@ -0,0 +1 @@ + diff --git a/crates/oxc_js_regex/src/lib.rs b/crates/oxc_js_regex/src/lib.rs new file mode 100644 index 0000000000000..6647fb03be8f5 --- /dev/null +++ b/crates/oxc_js_regex/src/lib.rs @@ -0,0 +1,5 @@ +pub mod ast; +mod lexer; +pub mod parser; +pub mod validator; +pub mod visitor; diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/crates/oxc_js_regex/src/validator.rs b/crates/oxc_js_regex/src/validator.rs new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/crates/oxc_js_regex/src/visitor.rs b/crates/oxc_js_regex/src/visitor.rs new file mode 100644 index 0000000000000..e69de29bb2d1d From 664e32b4628f5b9ae62f4cb5fe76e6397032fb7e Mon Sep 17 00:00:00 2001 From: Ubugeeei Date: Wed, 22 Nov 2023 22:55:05 +0900 Subject: [PATCH 2/5] remove:
dead comment --- crates/oxc_js_regex/src/ast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/oxc_js_regex/src/ast.rs b/crates/oxc_js_regex/src/ast.rs index a32b08caa3f8e..6690c5f7386fc 100644 --- a/crates/oxc_js_regex/src/ast.rs +++ b/crates/oxc_js_regex/src/ast.rs @@ -359,7 +359,7 @@ pub struct Character { #[derive(Debug)] pub enum BackreferenceRef { - Number(i32), // FIXME: + Number(i32), Atom(Atom), } From f3981e627cc952e018b9120f0c541ed4359771ad Mon Sep 17 00:00:00 2001 From: Ubugeeei Date: Wed, 22 Nov 2023 23:14:43 +0900 Subject: [PATCH 3/5] remove: unused dependencies --- Cargo.lock | 2 -- crates/oxc_js_regex/Cargo.toml | 2 -- 2 files changed, 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3fd4bec9ccb4b..726086bb882df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1631,9 +1631,7 @@ name = "oxc_js_regex" version = "0.0.0" dependencies = [ "oxc_allocator", - "oxc_index", "oxc_span", - "oxc_syntax", ] [[package]] diff --git a/crates/oxc_js_regex/Cargo.toml b/crates/oxc_js_regex/Cargo.toml index 895809c4f7b46..ab3df9dde38be 100644 --- a/crates/oxc_js_regex/Cargo.toml +++ b/crates/oxc_js_regex/Cargo.toml @@ -17,5 +17,3 @@ rust-version.workspace = true [dependencies] oxc_allocator = { workspace = true } oxc_span = { workspace = true } -oxc_syntax = { workspace = true } -oxc_index = { workspace = true } From b72b5be477393f7995c036d1ace2b397ef8b67fb Mon Sep 17 00:00:00 2001 From: Ubugeeei Date: Thu, 23 Nov 2023 00:27:02 +0900 Subject: [PATCH 4/5] chore: configure lints and doctest --- crates/oxc_js_regex/Cargo.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/oxc_js_regex/Cargo.toml b/crates/oxc_js_regex/Cargo.toml index ab3df9dde38be..ff757bddfb9d2 100644 --- a/crates/oxc_js_regex/Cargo.toml +++ b/crates/oxc_js_regex/Cargo.toml @@ -12,7 +12,11 @@ license.workspace = true repository.workspace = true rust-version.workspace = true -# See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html +[lints] +workspace = true + +[lib] +doctest = false [dependencies] oxc_allocator = { workspace = true } From 35edcb7182438483fbcf3496358a2e2ace737fbc Mon Sep 17 00:00:00 2001 From: Ubugeeei Date: Thu, 23 Nov 2023 00:29:27 +0900 Subject: [PATCH 5/5] chore: oxc_js_regex author --- crates/oxc_js_regex/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/oxc_js_regex/Cargo.toml b/crates/oxc_js_regex/Cargo.toml index ff757bddfb9d2..38dcac20c5f06 100644 --- a/crates/oxc_js_regex/Cargo.toml +++ b/crates/oxc_js_regex/Cargo.toml @@ -2,7 +2,7 @@ name = "oxc_js_regex" version = "0.0.0" publish = false -authors.workspace = true +authors = ["Ubugeeei "] categories.workspace = true description.workspace = true edition.workspace = true