Skip to content

Commit

Permalink
chore(regular_expression): Update example to support RegExp construct…
Browse files Browse the repository at this point in the history
…or (#5106)

- Fix example to handle `new RegExp()` too
- Update NOTE comments

- - -

Until I tried interacting with the actual AST parsed by `oxc_parser`, I thought that the current `oxc_regular_expression` lacked support for the `RegExp` constructor due to escape sequences.

This was because `"\""` remained `"\""` after reading the source text from `.js` files.

However, once it was parsed by `oxc_parser`, I found that everything was [resolved](https://github.com/oxc-project/oxc/blob/8ef85a43c019a1ce9aa50b61ec4dbb5dbaeb3b7b/crates/oxc_parser/src/lexer/string.rs)! (Wonderful work as usual. 👏🏻 )

Now there is nothing to worry about. 😌
  • Loading branch information
leaysgur committed Aug 23, 2024
1 parent aa7718a commit c7b81f5
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 18 deletions.
78 changes: 65 additions & 13 deletions crates/oxc_regular_expression/examples/parse_file.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
#![allow(clippy::print_stdout)]
#![allow(clippy::print_stdout, clippy::cast_possible_truncation)]
use std::{env, fs, path::Path, sync::Arc};

use oxc_allocator::Allocator;
use oxc_ast::AstKind;
use oxc_ast::{ast, AstKind};
use oxc_parser::Parser;
use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser};
use oxc_semantic::SemanticBuilder;
use oxc_span::SourceType;

fn main() {
// 1. Get the file content and parse
let name = env::args().nth(1).unwrap_or_else(|| "test.js".to_string());
let path = Path::new(&name);

Expand All @@ -26,23 +28,31 @@ fn main() {
return;
}

// 2. Build the semantic to iterate over the nodes
let program = allocator.alloc(parser_ret.program);
let semantic_ret = SemanticBuilder::new(&source_text, source_type).build(program);
let semantic = semantic_ret.semantic;

// 3. Parse regular expressions
// - RegExpLiteral
// - new RegExp() with string or template literal if static
for node in semantic.nodes().iter() {
match node.kind() {
AstKind::RegExpLiteral(re) => {
let literal = re.span.source_text(&source_text);
let parsed = oxc_regular_expression::Parser::new(
println!("🍀 {}", re.span.source_text(&source_text));

let parsed = PatternParser::new(
&allocator,
literal,
oxc_regular_expression::ParserOptions::default()
.with_span_offset(re.span.start),
re.regex.pattern.as_str(),
ParserOptions {
span_offset: re.span.start + 1,
unicode_mode: re.regex.flags.contains(ast::RegExpFlags::U)
|| re.regex.flags.contains(ast::RegExpFlags::V),
unicode_sets_mode: re.regex.flags.contains(ast::RegExpFlags::V),
},
)
.parse();

println!("🍀 {literal}");
if let Err(error) = parsed {
let error = error.with_source_code(Arc::clone(&source_text));
println!("{error:?}");
Expand All @@ -51,18 +61,60 @@ fn main() {
println!("{parsed:#?}");
println!();
}
AstKind::NewExpression(new_expr) => {
AstKind::NewExpression(new_expr)
if new_expr
.callee
.get_identifier_reference()
.filter(|ident| ident.name == "RegExp")
.is_some()
{
println!("👻 TODO: new RegExp(...)");
println!();
.is_some() =>
{
println!("🍀 {}", new_expr.span.source_text(&source_text));

let pattern = match new_expr.arguments.first() {
Some(ast::Argument::StringLiteral(sl)) => &sl.value,
Some(ast::Argument::TemplateLiteral(tl))
if tl.is_no_substitution_template() =>
{
&tl.quasi().unwrap()
}
_ => {
continue;
}
};

let flags = match new_expr.arguments.get(1) {
Some(ast::Argument::StringLiteral(sl)) => &sl.value,
Some(ast::Argument::TemplateLiteral(tl))
if tl.is_no_substitution_template() =>
{
&tl.quasi().unwrap()
}
_ => "",
};

let flags =
FlagsParser::new(&allocator, flags, ParserOptions::default()).parse().unwrap();
let parsed = PatternParser::new(
&allocator,
pattern,
ParserOptions {
span_offset: new_expr.span.start + 12, // = "new RegExp(\"".len()
unicode_mode: flags.unicode || flags.unicode_sets,
unicode_sets_mode: flags.unicode_sets,
},
)
.parse();

if let Err(error) = parsed {
let error = error.with_source_code(Arc::clone(&source_text));
println!("{error:?}");
return;
}
println!("{parsed:#?}");
println!();
}
_ => {}
}
}
println!("✨ All parsed!");
}
10 changes: 7 additions & 3 deletions crates/oxc_regular_expression/examples/test.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
const re1 = /abc{1}/gsv;
const re2 = new RegExp("ooo", "u");
const re3 = /[\w--[v]]/gsv;
// All of them should be the same result!
[
/\1(.)\\"'`a/v,
new RegExp("\\1(.)\\\\\"'`\a","v"),
new RegExp('\\1(.)\\\\"\'`\a','v'),
new RegExp(`\\1(.)\\\\"'\`\a`,`v`),
]
16 changes: 14 additions & 2 deletions crates/oxc_regular_expression/src/body_parser/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,29 @@ pub struct Reader<'a> {
unicode_mode: bool,
/// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
index: usize,
// NOTE: Distinguishing these 2 units looks cleaner, but it may not be necessary.
//
// If I understand correctly (and there are no unexpected factors),
// AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
//
// Therefore, performance might be improved by:
// - using only `u8_units`, and
// - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units.
// However, I'm not certain this approach is faster than the current one, which uses `encode_utf16()` all at once.
/// Iteration units for unicode mode.
/// Even in non-unicode mode, used for `Span` offset calculation.
u8_units: Vec<(usize, char)>,
/// Iteration units for non-unicode mode.
u16_units: Vec<u16>,
/// Last offset caches for non-unicode mode.
last_offset_indices: (usize, usize),
}

impl<'a> Reader<'a> {
pub fn new(source: &'a str, unicode_mode: bool) -> Self {
// NOTE: Distinguishing these 2 units looks cleaner, but it may not be necessary.
// As a parser, AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
// NOTE: Collecting into a `Vec` may not be efficient if the source is too large.
// Implementing a lookahead cache with `VecDeque` might be better...?
// But when I tried it once, there were no notable improvements.
let u8_units = source.char_indices().collect::<Vec<_>>();
let u16_units = if unicode_mode { "" } else { source }.encode_utf16().collect::<Vec<_>>();

Expand All @@ -26,6 +36,8 @@ impl<'a> Reader<'a> {
if self.unicode_mode {
self.u8_units.get(self.index).map_or(self.source.len(), |(idx, _)| *idx)
} else {
// NOTE: This does not return a valid `Span` offset for surrogate pairs.
// In the first place, there is no such thing as a string slice corresponding to them...
let (mut u16_idx, mut u8_idx) = self.last_offset_indices;
for (idx, ch) in &self.u8_units[u8_idx..] {
if self.index <= u16_idx {
Expand Down

0 comments on commit c7b81f5

Please sign in to comment.