diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs index 3a1e008cd58c5..1544cb4a4948e 100644 --- a/crates/oxc_regular_expression/src/body_parser/parser.rs +++ b/crates/oxc_regular_expression/src/body_parser/parser.rs @@ -7,6 +7,7 @@ use crate::{ body_parser::{diagnostics, reader::Reader, state::State, unicode, unicode_property}, options::ParserOptions, span::SpanFactory, + surroage_pair, }; pub struct PatternParser<'a> { @@ -1847,14 +1848,14 @@ impl<'a> PatternParser<'a> { let span_start = self.reader.offset(); if let Some(lead_surrogate) = - self.reader.peek().filter(|&cp| unicode::is_lead_surrogate(cp)) + self.reader.peek().filter(|&cp| surroage_pair::is_lead_surrogate(cp)) { if let Some(trail_surrogate) = - self.reader.peek2().filter(|&cp| unicode::is_trail_surrogate(cp)) + self.reader.peek2().filter(|&cp| surroage_pair::is_trail_surrogate(cp)) { self.reader.advance(); self.reader.advance(); - let cp = unicode::combine_surrogate_pair(lead_surrogate, trail_surrogate); + let cp = surroage_pair::combine_surrogate_pair(lead_surrogate, trail_surrogate); // [SS:EE] RegExpIdentifierStart :: UnicodeLeadSurrogate UnicodeTrailSurrogate // It is a Syntax Error if the RegExpIdentifierCodePoint of RegExpIdentifierStart is not matched by the UnicodeIDStart lexical grammar production. 
@@ -1907,15 +1908,15 @@ impl<'a> PatternParser<'a> { let span_start = self.reader.offset(); if let Some(lead_surrogate) = - self.reader.peek().filter(|&cp| unicode::is_lead_surrogate(cp)) + self.reader.peek().filter(|&cp| surroage_pair::is_lead_surrogate(cp)) { if let Some(trail_surrogate) = - self.reader.peek2().filter(|&cp| unicode::is_trail_surrogate(cp)) + self.reader.peek2().filter(|&cp| surroage_pair::is_trail_surrogate(cp)) { self.reader.advance(); self.reader.advance(); - let cp = unicode::combine_surrogate_pair(lead_surrogate, trail_surrogate); + let cp = surroage_pair::combine_surrogate_pair(lead_surrogate, trail_surrogate); // [SS:EE] RegExpIdentifierPart :: UnicodeLeadSurrogate UnicodeTrailSurrogate // It is a Syntax Error if the RegExpIdentifierCodePoint of RegExpIdentifierPart is not matched by the UnicodeIDContinue lexical grammar production. if !unicode::is_unicode_id_continue(cp) { @@ -1953,15 +1954,16 @@ impl<'a> PatternParser<'a> { let checkpoint = self.reader.checkpoint(); // HexLeadSurrogate + HexTrailSurrogate - if let Some(lead_surrogate) = - self.consume_fixed_hex_digits(4).filter(|&cp| unicode::is_lead_surrogate(cp)) + if let Some(lead_surrogate) = self + .consume_fixed_hex_digits(4) + .filter(|&cp| surroage_pair::is_lead_surrogate(cp)) { if self.reader.eat2('\\', 'u') { if let Some(trail_surrogate) = self .consume_fixed_hex_digits(4) - .filter(|&cp| unicode::is_trail_surrogate(cp)) + .filter(|&cp| surroage_pair::is_trail_surrogate(cp)) { - return Ok(Some(unicode::combine_surrogate_pair( + return Ok(Some(surroage_pair::combine_surrogate_pair( lead_surrogate, trail_surrogate, ))); @@ -1971,16 +1973,18 @@ impl<'a> PatternParser<'a> { self.reader.rewind(checkpoint); // HexLeadSurrogate - if let Some(lead_surrogate) = - self.consume_fixed_hex_digits(4).filter(|&cp| unicode::is_lead_surrogate(cp)) + if let Some(lead_surrogate) = self + .consume_fixed_hex_digits(4) + .filter(|&cp| surroage_pair::is_lead_surrogate(cp)) { return 
Ok(Some(lead_surrogate)); } self.reader.rewind(checkpoint); // HexTrailSurrogate - if let Some(trail_surrogate) = - self.consume_fixed_hex_digits(4).filter(|&cp| unicode::is_trail_surrogate(cp)) + if let Some(trail_surrogate) = self + .consume_fixed_hex_digits(4) + .filter(|&cp| surroage_pair::is_trail_surrogate(cp)) { return Ok(Some(trail_surrogate)); } diff --git a/crates/oxc_regular_expression/src/body_parser/unicode.rs b/crates/oxc_regular_expression/src/body_parser/unicode.rs index 7a4249b4734f2..aa1211c6c4e93 100644 --- a/crates/oxc_regular_expression/src/body_parser/unicode.rs +++ b/crates/oxc_regular_expression/src/body_parser/unicode.rs @@ -114,18 +114,6 @@ pub fn is_identifier_part_char(cp: u32) -> bool { char::from_u32(cp).map_or(false, |ch| unicode_id_start::is_id_continue(ch) || ch == '$') } -pub fn is_lead_surrogate(cp: u32) -> bool { - (0xd800..=0xdbff).contains(&cp) -} - -pub fn is_trail_surrogate(cp: u32) -> bool { - (0xdc00..=0xdfff).contains(&cp) -} - -pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 { - (lead - 0xd800) * 0x400 + trail - 0xdc00 + 0x10000 -} - pub fn map_control_escape(cp: u32) -> Option { match char::from_u32(cp) { Some('f') => Some(0x0c), diff --git a/crates/oxc_regular_expression/src/display.rs b/crates/oxc_regular_expression/src/display.rs new file mode 100644 index 0000000000000..e309ecbe3dbe1 --- /dev/null +++ b/crates/oxc_regular_expression/src/display.rs @@ -0,0 +1,557 @@ +use std::{ + fmt::{self, Display}, + iter::Peekable, +}; + +#[allow(clippy::wildcard_imports)] +use crate::ast::*; +use crate::surroage_pair::{combine_surrogate_pair, is_lead_surrogate, is_trail_surrogate}; + +impl<'a> Display for RegularExpression<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "/{}/{}", self.pattern, self.flags) + } +} + +impl Display for Flags { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut flags = String::with_capacity(8); + macro_rules! 
if_true_append { + ($flag:ident, $char:literal) => { + if self.$flag { + flags.push($char); + } + }; + } + + // write flags in the order they are described in the `MDN` + // + if_true_append!(has_indices, 'd'); + if_true_append!(global, 'g'); + if_true_append!(ignore_case, 'i'); + if_true_append!(multiline, 'm'); + if_true_append!(dot_all, 's'); + if_true_append!(unicode, 'u'); + if_true_append!(unicode_sets, 'v'); + if_true_append!(sticky, 'y'); + + write!(f, "{flags}") + } +} + +impl<'a> Display for Pattern<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.body) + } +} + +impl<'a> Display for Disjunction<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write_join(f, "|", &self.body) + } +} + +impl<'a> Display for Alternative<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn as_character<'a>(term: &'a Term) -> Option<&'a Character> { + if let Term::Character(ch) = term { + Some(ch) + } else { + None + } + } + write_join_with(f, "", &self.body, |iter| { + let next = iter.next()?; + let Some(next) = as_character(next) else { return Some(next.to_string()) }; + let peek = iter.peek().and_then(|it| as_character(it)); + let (result, eat) = character_to_string(next, peek); + if eat { + _ = iter.next(); + } + Some(result) + }) + } +} + +impl<'a> Display for Term<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::BoundaryAssertion(it) => write!(f, "{it}"), + Self::LookAroundAssertion(it) => write!(f, "{}", it.as_ref()), + Self::Quantifier(it) => write!(f, "{}", it.as_ref()), + Self::Character(it) => write!(f, "{it}"), + Self::Dot(it) => write!(f, "{it}"), + Self::CharacterClassEscape(it) => write!(f, "{it}"), + Self::UnicodePropertyEscape(it) => write!(f, "{}", it.as_ref()), + Self::CharacterClass(it) => write!(f, "{}", it.as_ref()), + Self::CapturingGroup(it) => write!(f, "{}", it.as_ref()), + Self::IgnoreGroup(it) => write!(f, "{}", it.as_ref()), + 
Self::IndexedReference(it) => write!(f, "{it}"), + Self::NamedReference(it) => write!(f, "{}", it.as_ref()), + } + } +} + +impl Display for BoundaryAssertion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.kind) + } +} + +impl Display for BoundaryAssertionKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Start => write!(f, "^"), + Self::End => write!(f, "$"), + Self::Boundary => write!(f, r"\b"), + Self::NegativeBoundary => write!(f, r"\B"), + } + } +} + +impl<'a> Display for LookAroundAssertion<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "({}{})", self.kind, self.body) + } +} + +impl Display for LookAroundAssertionKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Lookahead => write!(f, "?="), + Self::NegativeLookahead => write!(f, "?!"), + Self::Lookbehind => write!(f, "?<="), + Self::NegativeLookbehind => write!(f, "? Display for Quantifier<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.body)?; + + match (self.min, self.max) { + (0, None) => write!(f, "*")?, + (1, None) => write!(f, "+")?, + (0, Some(1)) => write!(f, "?")?, + (min, Some(max)) if min == max => write!(f, "{{{min}}}",)?, + (min, max) => { + let max = max.map_or_else(String::default, |it| it.to_string()); + write!(f, "{{{min},{max}}}",)?; + } + } + + if !self.greedy { + write!(f, "?")?; + } + + Ok(()) + } +} + +impl Display for Character { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let (string, _) = character_to_string(self, None); + write!(f, "{string}") + } +} + +impl Display for Dot { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, ".") + } +} + +impl Display for CharacterClassEscape { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.kind) + } +} + +impl Display for CharacterClassEscapeKind { + fn fmt(&self, f: &mut 
fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::D => write!(f, r"\d"), + Self::NegativeD => write!(f, r"\D"), + Self::S => write!(f, r"\s"), + Self::NegativeS => write!(f, r"\S"), + Self::W => write!(f, r"\w"), + Self::NegativeW => write!(f, r"\W"), + } + } +} + +impl<'a> Display for UnicodePropertyEscape<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.negative { + write!(f, r"\P")?; + } else { + write!(f, r"\p")?; + } + + if let Some(value) = &self.value { + let name = &self.name; + write!(f, "{{{name}={value}}}") + } else { + write!(f, "{{{}}}", self.name) + } + } +} + +impl<'a> Display for CharacterClass<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn as_character<'a>(content: &'a CharacterClassContents) -> Option<&'a Character> { + if let CharacterClassContents::Character(ch) = content { + Some(ch) + } else { + None + } + } + write!(f, "[")?; + + if !self.body.is_empty() { + if self.negative { + write!(f, "^")?; + } + let sep = match self.kind { + CharacterClassContentsKind::Union => "", + CharacterClassContentsKind::Subtraction => "--", + CharacterClassContentsKind::Intersection => "&&", + }; + write_join_with(f, sep, &self.body, |iter| { + let next = iter.next()?; + let Some(next) = as_character(next) else { return Some(next.to_string()) }; + let peek = iter.peek().and_then(|it| as_character(it)); + let (result, eat) = character_to_string(next, peek); + if eat { + _ = iter.next(); + } + Some(result) + })?; + } + + write!(f, "]") + } +} + +impl<'a> Display for CharacterClassContents<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::CharacterClassRange(it) => write!(f, "{}", it.as_ref()), + Self::CharacterClassEscape(it) => write!(f, "{it}"), + Self::UnicodePropertyEscape(it) => write!(f, "{}", it.as_ref()), + Self::Character(it) => write!(f, "{it}"), + Self::NestedCharacterClass(it) => write!(f, "{}", it.as_ref()), + Self::ClassStringDisjunction(it) => 
write!(f, "{}", it.as_ref()), + } + } +} + +impl Display for CharacterClassRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}-{}", self.min, self.max) + } +} + +impl<'a> Display for ClassStringDisjunction<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, r"\q{{")?; + write_join(f, "|", &self.body)?; + write!(f, "}}") + } +} + +impl<'a> Display for ClassString<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write_join(f, "", &self.body) + } +} + +impl<'a> Display for CapturingGroup<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let body = &self.body; + if let Some(name) = &self.name { + write!(f, "(?<{name}>{body})") + } else { + write!(f, "({body})") + } + } +} + +impl<'a> Display for IgnoreGroup<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn write_flags( + f: &mut fmt::Formatter<'_>, + prefix: char, + flags: &ModifierFlags, + ) -> fmt::Result { + if flags.ignore_case { + write!(f, "{prefix}i")?; + } + if flags.sticky { + write!(f, "{prefix}y")?; + } + if flags.multiline { + write!(f, "{prefix}m")?; + } + Ok(()) + } + + write!(f, "(?")?; + if let Some(enabling) = &self.enabling_modifiers { + write_flags(f, '\0', enabling)?; + } + if let Some(disabling) = &self.disabling_modifiers { + write_flags(f, '-', disabling)?; + } + write!(f, ":{})", self.body) + } +} + +impl Display for IndexedReference { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, r"\{}", self.index) + } +} + +impl<'a> Display for NamedReference<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, r"\k<{}>", self.name) + } +} + +fn write_join(f: &mut fmt::Formatter<'_>, sep: S, items: I) -> fmt::Result +where + S: AsRef, + E: Display, + I: IntoIterator, +{ + write_join_with(f, sep, items, |iter| iter.next().map(|it| it.to_string())) +} + +fn write_join_with(f: &mut fmt::Formatter<'_>, sep: S, items: I, next: F) -> 
fmt::Result +where + S: AsRef, + E: Display, + I: IntoIterator, + F: Fn(&mut Peekable) -> Option, +{ + let sep = sep.as_ref(); + let iter = &mut items.into_iter().peekable(); + + if let Some(first) = next(iter) { + write!(f, "{first}")?; + } + + while let Some(it) = next(iter) { + write!(f, "{sep}{it}")?; + } + + Ok(()) +} + +fn character_to_string( + this: &Character, + peek: Option<&Character>, +) -> (/* result */ String, /* true of peek should be consumed */ bool) { + let cp = this.value; + + if let CharacterKind::Symbol = this.kind { + // Trail only + if is_trail_surrogate(cp) { + return (format!(r"\u{cp:X}"), false); + } + + if is_lead_surrogate(cp) { + if let Some(peek) = peek.filter(|peek| is_trail_surrogate(peek.value)) { + // Lead+Trail + let cp = combine_surrogate_pair(cp, peek.value); + let ch = char::from_u32(cp).expect("Invalid surrogate pair `Character`!"); + return (format!("{ch}"), true); + } + + // Lead only + return (format!(r"\u{cp:X}"), false); + } + } + + let ch = char::from_u32(cp).expect("Invalid `Character`!"); + let result = match this.kind { + CharacterKind::ControlLetter => match ch { + '\n' => r"\cJ".to_string(), + '\r' => r"\cM".to_string(), + '\t' => r"\cI".to_string(), + _ => format!(r"\c{ch}"), + }, + CharacterKind::Identifier => { + format!(r"\{ch}") + } + // Not a surrogate, like BMP, or all units in unicode mode + CharacterKind::Symbol => format!("{ch}"), + CharacterKind::Null => String::from(r"\0"), + CharacterKind::UnicodeEscape => { + // we remove the leading `0x` of our 4 digit hex number. + let hex = &format!("{cp:#4X}")[2..]; + if hex.len() <= 4 { + format!(r"\u{hex}") + } else { + format!(r"\u{{{hex}}}") + } + } + CharacterKind::HexadecimalEscape => { + // we remove the leading `0x` of our 2 digit hex number. 
+ let hex = &format!("{cp:#2X}")[2..]; + format!(r"\x{hex}") + } + CharacterKind::Octal => { + let octal = format!("{cp:o}"); + format!(r"\{octal}") + } + CharacterKind::SingleEscape => match ch { + '\n' => String::from(r"\n"), + '\r' => String::from(r"\r"), + '\t' => String::from(r"\t"), + '\u{b}' => String::from(r"\v"), + '\u{c}' => String::from(r"\f"), + '\u{8}' => String::from(r"\b"), + '\u{2D}' => String::from(r"\-"), + _ => format!(r"\{ch}"), + }, + }; + + (result, false) +} + +#[cfg(test)] +mod test { + use oxc_allocator::Allocator; + + type Case<'a> = ( + &'a str, + /* expected display, None means expect the same as original */ Option<&'a str>, + ); + + static CASES: &[Case] = &[ + ("/ab/", None), + ("/ab/u", None), + ("/abc/i", None), + ("/abc/iu", None), + ("/a*?/i", None), + ("/a*?/iu", None), + ("/emo👈🏻ji/", None), + ("/emo👈🏻ji/u", None), + ("/ab|c/i", None), + ("/ab|c/iu", None), + ("/a|b+|c/i", None), + ("/a|b+|c/iu", None), + ("/(?=a)|(?<=b)|(?!c)|(?cg)(?:g)/", None), + (r"/(cg)(?cg)(?:g)/u", None), + (r"/^(?=ab)\b(?!cd)(?<=ef)\B(? 
+ (r"/\1/", None), + (r"/\2/", None), + (r"/\3/", None), + (r"/\4/", None), + (r"/\5/", None), + (r"/\6/", None), + (r"/\7/", None), + // NOTE: we remove leading zeroes + (r"/\00/", Some(r"/\0/")), + // NOTE: we remove leading zeroes + (r"/\07/", Some(r"/\7/")), + (r"/\40/", None), + (r"/\47/", None), + (r"/\70/", None), + (r"/\77/", None), + // NOTE: we remove leading zeroes + (r"/\000/", Some(r"/\0/")), + // NOTE: we remove leading zeroes + (r"/\007/", Some(r"/\7/")), + // NOTE: we remove leading zeroes + (r"/\070/", Some(r"/\70/")), + (r"/\300/", None), + (r"/\307/", None), + (r"/\370/", None), + (r"/\377/", None), + (r"/(.)\1/", None), + // Identity escape from: + (r"/\C/", None), + (r"/O\PQ/", None), + (r"/\8/", None), + (r"/7\89/", None), + (r"/\9/", None), + (r"/8\90/", None), + (r"/(.)(.)(.)(.)(.)(.)(.)(.)\8\8/", None), + // Class escape from: + (r"/\c0/", None), + (r"/[\c0]/", None), + (r"/\c1/", None), + (r"/[\c10]+/", None), + (r"/\c8/", None), + (r"/[\c8]/", None), + (r"/[\c80]+/", None), + (r"/\c_/", None), + ]; + + fn test_display(allocator: &Allocator, (source, expect): &Case) { + use crate::{Parser, ParserOptions}; + let expect = expect.unwrap_or(source); + let actual = Parser::new(allocator, source, ParserOptions::default()).parse().unwrap(); + assert_eq!(expect, actual.to_string()); + } + + #[test] + fn test() { + let allocator = &Allocator::default(); + CASES.iter().for_each(|case| test_display(allocator, case)); + } +} diff --git a/crates/oxc_regular_expression/src/lib.rs b/crates/oxc_regular_expression/src/lib.rs index ae2f1c0a58bc0..1a4df8a4e9025 100644 --- a/crates/oxc_regular_expression/src/lib.rs +++ b/crates/oxc_regular_expression/src/lib.rs @@ -2,10 +2,12 @@ pub mod ast; mod body_parser; +mod display; mod flag_parser; mod literal_parser; mod options; mod span; +mod surroage_pair; pub use crate::body_parser::PatternParser; pub use crate::flag_parser::FlagsParser; diff --git a/crates/oxc_regular_expression/src/surroage_pair.rs 
/// Returns `true` when `cp` lies in the lead (high) surrogate range
/// `0xD800..=0xDBFF`.
pub fn is_lead_surrogate(cp: u32) -> bool {
    matches!(cp, 0xD800..=0xDBFF)
}

/// Returns `true` when `cp` lies in the trail (low) surrogate range
/// `0xDC00..=0xDFFF`.
pub fn is_trail_surrogate(cp: u32) -> bool {
    matches!(cp, 0xDC00..=0xDFFF)
}

/// Combines a lead and a trail surrogate into the code point they encode.
///
/// Callers are expected to pass values accepted by [`is_lead_surrogate`] and
/// [`is_trail_surrogate`] respectively; no validation happens here.
pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 {
    // The lead contributes the upper ten bits of the offset above 0x10000.
    let upper = (lead - 0xD800) * 0x400;
    upper + trail - 0xDC00 + 0x1_0000
}