From c7b81f5762b0a6880307f73a31beb5adbd88f01e Mon Sep 17 00:00:00 2001 From: leaysgur <6259812+leaysgur@users.noreply.github.com> Date: Fri, 23 Aug 2024 04:57:31 +0000 Subject: [PATCH] chore(regular_expression): Update example to support RegExp constructor (#5106) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix example to handle `new RegExp()` too - Update NOTE comments - - - Until I tried interacting with the actual AST parsed by `oxc_parser`, I thought that the current `oxc_regular_expression` lacked support for the `RegExp` constructor due to escape sequences. This was because `"\""` remained `"\""` after reading the source text from `.js` files. However, once it was parsed by `oxc_parser`, I found that everything was [resolved](https://github.com/oxc-project/oxc/blob/8ef85a43c019a1ce9aa50b61ec4dbb5dbaeb3b7b/crates/oxc_parser/src/lexer/string.rs)! (Wonderful work as usual. 👏🏻 ) Now there is nothing to worry about. 😌 --- .../examples/parse_file.rs | 78 +++++++++++++++---- .../oxc_regular_expression/examples/test.js | 10 ++- .../src/body_parser/reader.rs | 16 +++- 3 files changed, 86 insertions(+), 18 deletions(-) diff --git a/crates/oxc_regular_expression/examples/parse_file.rs b/crates/oxc_regular_expression/examples/parse_file.rs index f25c068819712..2966b12325858 100644 --- a/crates/oxc_regular_expression/examples/parse_file.rs +++ b/crates/oxc_regular_expression/examples/parse_file.rs @@ -1,13 +1,15 @@ -#![allow(clippy::print_stdout)] +#![allow(clippy::print_stdout, clippy::cast_possible_truncation)] use std::{env, fs, path::Path, sync::Arc}; use oxc_allocator::Allocator; -use oxc_ast::AstKind; +use oxc_ast::{ast, AstKind}; use oxc_parser::Parser; +use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser}; use oxc_semantic::SemanticBuilder; use oxc_span::SourceType; fn main() { + // 1. 
Get the file content and parse let name = env::args().nth(1).unwrap_or_else(|| "test.js".to_string()); let path = Path::new(&name); @@ -26,23 +28,31 @@ fn main() { return; } + // 2. Build the semantic to iterate over the nodes let program = allocator.alloc(parser_ret.program); let semantic_ret = SemanticBuilder::new(&source_text, source_type).build(program); let semantic = semantic_ret.semantic; + // 3. Parse regular expressions + // - RegExpLiteral + // - new RegExp() with string or template literal if static for node in semantic.nodes().iter() { match node.kind() { AstKind::RegExpLiteral(re) => { - let literal = re.span.source_text(&source_text); - let parsed = oxc_regular_expression::Parser::new( + println!("🍀 {}", re.span.source_text(&source_text)); + + let parsed = PatternParser::new( &allocator, - literal, - oxc_regular_expression::ParserOptions::default() - .with_span_offset(re.span.start), + re.regex.pattern.as_str(), + ParserOptions { + span_offset: re.span.start + 1, + unicode_mode: re.regex.flags.contains(ast::RegExpFlags::U) + || re.regex.flags.contains(ast::RegExpFlags::V), + unicode_sets_mode: re.regex.flags.contains(ast::RegExpFlags::V), + }, ) .parse(); - println!("🍀 {literal}"); if let Err(error) = parsed { let error = error.with_source_code(Arc::clone(&source_text)); println!("{error:?}"); @@ -51,18 +61,60 @@ fn main() { println!("{parsed:#?}"); println!(); } - AstKind::NewExpression(new_expr) => { + AstKind::NewExpression(new_expr) if new_expr .callee .get_identifier_reference() .filter(|ident| ident.name == "RegExp") - .is_some() - { - println!("👻 TODO: new RegExp(...)"); - println!(); + .is_some() => + { + println!("🍀 {}", new_expr.span.source_text(&source_text)); + + let pattern = match new_expr.arguments.first() { + Some(ast::Argument::StringLiteral(sl)) => &sl.value, + Some(ast::Argument::TemplateLiteral(tl)) + if tl.is_no_substitution_template() => + { + &tl.quasi().unwrap() + } + _ => { + continue; + } + }; + + let flags = match 
new_expr.arguments.get(1) { + Some(ast::Argument::StringLiteral(sl)) => &sl.value, + Some(ast::Argument::TemplateLiteral(tl)) + if tl.is_no_substitution_template() => + { + &tl.quasi().unwrap() + } + _ => "", + }; + + let flags = + FlagsParser::new(&allocator, flags, ParserOptions::default()).parse().unwrap(); + let parsed = PatternParser::new( + &allocator, + pattern, + ParserOptions { + span_offset: new_expr.span.start + 12, // = "new RegExp(\"".len() + unicode_mode: flags.unicode || flags.unicode_sets, + unicode_sets_mode: flags.unicode_sets, + }, + ) + .parse(); + + if let Err(error) = parsed { + let error = error.with_source_code(Arc::clone(&source_text)); + println!("{error:?}"); + return; } + println!("{parsed:#?}"); + println!(); } _ => {} } } + println!("✨ All parsed!"); } diff --git a/crates/oxc_regular_expression/examples/test.js b/crates/oxc_regular_expression/examples/test.js index 3b2b37e9414ad..7d740243972f4 100644 --- a/crates/oxc_regular_expression/examples/test.js +++ b/crates/oxc_regular_expression/examples/test.js @@ -1,3 +1,7 @@ -const re1 = /abc{1}/gsv; -const re2 = new RegExp("ooo", "u"); -const re3 = /[\w--[v]]/gsv; +// All of them should be the same result! +[ + /\1(.)\\"'`a/v, + new RegExp("\\1(.)\\\\\"'`\a","v"), + new RegExp('\\1(.)\\\\"\'`\a','v'), + new RegExp(`\\1(.)\\\\"'\`\a`,`v`), +] diff --git a/crates/oxc_regular_expression/src/body_parser/reader.rs b/crates/oxc_regular_expression/src/body_parser/reader.rs index a9453f46adce9..f066bdceaa121 100644 --- a/crates/oxc_regular_expression/src/body_parser/reader.rs +++ b/crates/oxc_regular_expression/src/body_parser/reader.rs @@ -3,8 +3,19 @@ pub struct Reader<'a> { unicode_mode: bool, /// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode). index: usize, + // NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary. 
+ // + // If I understand correctly (and there are no unexpected factors), + // AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs. + // + // Therefore, performance might be improved by: + // - using only `u8_units`, and + // - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units. + // However, I'm not certain this approach is faster than the current one using `encode_utf16()` all at once. + /// Iteration units for unicode mode. /// Even in non-unicode mode, used for `Span` offset calculation. u8_units: Vec<(usize, char)>, + /// Iteration units for non-unicode mode. u16_units: Vec<u16>, /// Last offset caches for non-unicode mode. last_offset_indices: (usize, usize), @@ -12,10 +23,9 @@ pub struct Reader<'a> { impl<'a> Reader<'a> { pub fn new(source: &'a str, unicode_mode: bool) -> Self { - // NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary. - // As as a parser, AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs. // NOTE: Collecting `Vec` may not be efficient if the source is too large. // Implements lookahead cache with `VecDeque` is better...? + // But when I tried once, there were no notable improvements. let u8_units = source.char_indices().collect::<Vec<_>>(); let u16_units = if unicode_mode { "" } else { source }.encode_utf16().collect::<Vec<_>>(); @@ -26,6 +36,8 @@ impl<'a> Reader<'a> { if self.unicode_mode { self.u8_units.get(self.index).map_or(self.source.len(), |(idx, _)| *idx) } else { + // NOTE: This does not return valid `Span` offset for surrogate pairs. + // In the first place, there is no such thing as string slice corresponding to them... let (mut u16_idx, mut u8_idx) = self.last_offset_indices; for (idx, ch) in &self.u8_units[u8_idx..] { if self.index <= u16_idx {