From 9cfb26145e1aa0d6c6c7369559e718d16906d650 Mon Sep 17 00:00:00 2001 From: y21 <30553356+y21@users.noreply.github.com> Date: Mon, 25 Dec 2023 01:16:09 +0100 Subject: [PATCH] add `RegExp.prototype.exec` --- crates/dash_middle/src/compiler/constant.rs | 2 +- crates/dash_middle/src/parser/expr.rs | 4 +- crates/dash_regex/src/lib.rs | 21 ++++++++- crates/dash_regex/src/matcher.rs | 40 +++++++++++----- crates/dash_regex/src/parser.rs | 19 ++++++-- crates/dash_regex/src/visitor.rs | 38 +++++++++------ crates/dash_vm/src/js_std/regex.rs | 52 +++++++++++++++++++-- crates/dash_vm/src/lib.rs | 1 + crates/dash_vm/src/statics.rs | 2 + crates/dash_vm/src/value/regex.rs | 8 ++-- testrunner/src/cmd/run.rs | 6 +-- 11 files changed, 149 insertions(+), 44 deletions(-) diff --git a/crates/dash_middle/src/compiler/constant.rs b/crates/dash_middle/src/compiler/constant.rs index 08e8332f..63d73f02 100755 --- a/crates/dash_middle/src/compiler/constant.rs +++ b/crates/dash_middle/src/compiler/constant.rs @@ -104,7 +104,7 @@ pub enum Constant { Identifier(Rc), Boolean(bool), Function(Rc), - Regex(dash_regex::Regex, Rc), + Regex(dash_regex::ParsedRegex, Rc), Null, Undefined, } diff --git a/crates/dash_middle/src/parser/expr.rs b/crates/dash_middle/src/parser/expr.rs index de5ed6a3..27c4c422 100644 --- a/crates/dash_middle/src/parser/expr.rs +++ b/crates/dash_middle/src/parser/expr.rs @@ -159,7 +159,7 @@ impl ExprKind { Self::Literal(LiteralExpr::Undefined) } - pub fn regex_literal(regex: dash_regex::Regex, source: Symbol) -> Self { + pub fn regex_literal(regex: dash_regex::ParsedRegex, source: Symbol) -> Self { Self::Literal(LiteralExpr::Regex(regex, source)) } @@ -499,7 +499,7 @@ pub enum LiteralExpr { String(Symbol), #[display(fmt = "/{_1}/")] - Regex(dash_regex::Regex, Symbol), + Regex(dash_regex::ParsedRegex, Symbol), #[display(fmt = "null")] Null, diff --git a/crates/dash_regex/src/lib.rs b/crates/dash_regex/src/lib.rs index 9a1c4204..e22b7731 100644 --- a/crates/dash_regex/src/lib.rs +++ b/crates/dash_regex/src/lib.rs @@ -10,7 +10,7 @@ pub mod parser; mod stream; mod visitor; -pub type Regex = Vec; +pub use parser::ParsedRegex; #[cfg(test)] #[test] @@ -22,7 +22,20 @@ pub fn test() { fn matches(regex: &str, input: &str) -> bool { let nodes = Parser::new(regex.as_bytes()).parse_all().unwrap(); let mut matcher = Matcher::new(&nodes, input.as_bytes()); - matcher.matches().is_some() + matcher.matches() + } + + fn matches_groups(regex: &str, input: &str, groups: &[&str]) -> bool { + let nodes = Parser::new(regex.as_bytes()).parse_all().unwrap(); + let mut matcher = Matcher::new(&nodes, input.as_bytes()); + matcher.matches() + && nodes.group_count - 1 == groups.len() + && matcher + .groups + .iter() + .skip(1) + .zip(groups) + .all(|(group, expected)| group.map(|range| &input[range]) == Some(*expected)) } const HEX_REGEX: &str = "^#?([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})$"; @@ -33,4 +46,8 @@ pub fn test() { assert!(matches("\\d", "a1")); assert!(matches("V\\dX", "aV1aVaXaV1Xs")); assert!(!matches("V\\dX", "aV1aVaXaV?Xs")); + + const RGB: &str = r"rgb[\s|\(]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))[,|\s]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))[,|\s]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))\s*\)?"; + assert!(matches(RGB, "rgb(255, 255, 255)")); + assert!(matches_groups(RGB, "rgb(144, 17, 9)", &["144", "17", "9"])); } diff --git a/crates/dash_regex/src/matcher.rs b/crates/dash_regex/src/matcher.rs index 09609a96..a4473458 100644 --- a/crates/dash_regex/src/matcher.rs +++ b/crates/dash_regex/src/matcher.rs @@ -3,28 +3,47 @@ use std::ops::Range; use smallvec::{smallvec, SmallVec}; use crate::node::Node; +use crate::parser::ParsedRegex; use crate::stream::BorrowedStream; use crate::visitor::Visit; pub struct Matcher<'a> { nodes: BorrowedStream<'a, Node>, text: BorrowedStream<'a, u8>, + pub groups: Groups, } -#[derive(Debug)] -pub struct Match { - pub groups: SmallVec<[Range; 1]>, +#[derive(Debug, Clone)] +pub struct Groups(SmallVec<[Option>; 1]>); + +impl Groups { + pub fn new(count: usize) -> Self { + Self(smallvec![None; count]) + } + + pub fn set(&mut self, index: usize, range: Range) { + self.0[index] = Some(range); + } + + pub fn get(&mut self, index: usize) -> Option> { + self.0[index].clone() + } + + pub fn iter(&self) -> impl Iterator>> + '_ { + self.0.iter().cloned() + } } impl<'a> Matcher<'a> { - pub fn new(nodes: &'a [Node], text: &'a [u8]) -> Self { + pub fn new(parsed_regex: &'a ParsedRegex, text: &'a [u8]) -> Self { Self { - nodes: BorrowedStream::new(nodes), + nodes: BorrowedStream::new(parsed_regex.nodes.as_slice()), text: BorrowedStream::new(text), + groups: Groups::new(parsed_regex.group_count), } } - pub fn matches(&mut self) -> Option { + pub fn matches(&mut self) -> bool { let mut index = self.text.index(); // TODO: what if text.len() == 0? @@ -32,9 +51,8 @@ impl<'a> Matcher<'a> { while index < self.text.len() { if self.nodes.is_eof() { // all regex nodes matched - return Some(Match { - groups: smallvec![index..self.text.index()], - }); + self.groups.set(0, index..self.text.index()); + return true; } if !self.matches_single() { @@ -44,11 +62,11 @@ impl<'a> Matcher<'a> { } } - None + false } pub fn matches_single(&mut self) -> bool { let node = self.nodes.next().unwrap(); - node.matches(&mut self.text) + node.matches(&mut self.text, &mut self.groups) } } diff --git a/crates/dash_regex/src/parser.rs b/crates/dash_regex/src/parser.rs index 6c8ea10c..aaa23eee 100644 --- a/crates/dash_regex/src/parser.rs +++ b/crates/dash_regex/src/parser.rs @@ -1,5 +1,7 @@ use std::mem; +use serde::{Deserialize, Serialize}; + use crate::error::Error; use crate::node::{Anchor, CharacterClassItem, GroupCaptureMode, MetaSequence, Node}; @@ -9,6 +11,13 @@ pub struct Parser<'a> { group_index: usize, } +#[derive(Debug, Clone)] +#[cfg_attr(feature = "format", derive(Serialize, Deserialize))] +pub struct ParsedRegex { + pub nodes: Vec, + pub group_count: usize, +} + impl<'a> Parser<'a> { pub fn new(input: &'a [u8]) -> Self { Self { @@ -41,7 +50,7 @@ impl<'a> Parser<'a> { self.index >= self.input.len() } - pub fn parse_all(mut self) -> Result, Error> { + pub fn parse_all(mut self) -> Result { let mut nodes = Vec::new(); while !self.is_eof() { if let Some(b'|') = self.current() { @@ -62,7 +71,10 @@ impl<'a> Parser<'a> { nodes.push(self.parse_primary()?); } } - Ok(nodes) + Ok(ParsedRegex { + nodes, + group_count: self.group_index, + }) } fn parse_primary(&mut self) -> Result { @@ -169,7 +181,8 @@ impl<'a> Parser<'a> { GroupCaptureMode::Id(self.group_index - 1) } } else { - GroupCaptureMode::None + self.group_index += 1; + GroupCaptureMode::Id(self.group_index - 1) }; while !self.is_eof() { diff --git a/crates/dash_regex/src/visitor.rs b/crates/dash_regex/src/visitor.rs index 3dae35e8..cbf93d4c 100644 --- a/crates/dash_regex/src/visitor.rs +++ b/crates/dash_regex/src/visitor.rs @@ -1,12 +1,13 @@ -use crate::node::{Anchor, CharacterClassItem, MetaSequence, Node}; +use crate::matcher::Groups; +use crate::node::{Anchor, CharacterClassItem, GroupCaptureMode, MetaSequence, Node}; use crate::stream::BorrowedStream; pub trait Visit<'a> { - fn matches(&self, s: &mut BorrowedStream<'a, u8>) -> bool; + fn matches(&self, s: &mut BorrowedStream<'a, u8>, groups: &mut Groups) -> bool; } impl<'a> Visit<'a> for Anchor { - fn matches(&self, s: &mut BorrowedStream<'a, u8>) -> bool { + fn matches(&self, s: &mut BorrowedStream<'a, u8>, _: &mut Groups) -> bool { match self { Anchor::StartOfString => s.index() == 0, Anchor::EndOfString => s.is_eof(), @@ -15,7 +16,7 @@ impl<'a> Visit<'a> for Anchor { } impl<'a> Visit<'a> for MetaSequence { - fn matches(&self, s: &mut BorrowedStream<'a, u8>) -> bool { + fn matches(&self, s: &mut BorrowedStream<'a, u8>, _: &mut Groups) -> bool { match self { Self::Digit => { let is_digit = s.current().map(|c| c.is_ascii_digit()).unwrap_or(false); @@ -46,7 +47,7 @@ impl<'a> Visit<'a> for MetaSequence { } impl<'a> Visit<'a> for Node { - fn matches(&self, s: &mut BorrowedStream<'a, u8>) -> bool { + fn matches(&self, s: &mut BorrowedStream<'a, u8>, groups: &mut Groups) -> bool { match self { Node::LiteralCharacter(lit) => { let matches = s.current().map(|c| c == lit).unwrap_or(false); @@ -56,21 +57,32 @@ impl<'a> Visit<'a> for Node { matches } Node::Optional(node) => { - node.matches(s); + node.matches(s, groups); true } - Node::Group(_, group) => group.iter().all(|node| node.matches(s)), + Node::Group(capture, group) => { + let before = s.index(); + let all_matched = group.iter().all(|node| node.matches(s, groups)); + + match capture { + GroupCaptureMode::Id(id) if all_matched => { + groups.set(*id, before..s.index()); + true + } + _ => all_matched, + } + } Node::Or(left, right) => { let left_index = s.index(); - let left_matches = left.iter().all(|node| node.matches(s)); + let left_matches = left.iter().all(|node| node.matches(s, groups)); if left_matches { return true; } s.set_index(left_index); - right.iter().all(|node| node.matches(s)) + right.iter().all(|node| node.matches(s, groups)) } - Node::Anchor(anchor) => anchor.matches(s), - Node::MetaSequence(seq) => seq.matches(s), + Node::Anchor(anchor) => anchor.matches(s, groups), + Node::MetaSequence(seq) => seq.matches(s, groups), Node::Repetition { node, min, max } => { let mut count = 0; while !s.is_eof() { @@ -80,7 +92,7 @@ impl<'a> Visit<'a> for Node { } } - if !node.matches(s) { + if !node.matches(s, groups) { break; } count += 1; @@ -98,7 +110,7 @@ impl<'a> Visit<'a> for Node { let Some(&cur) = s.current() else { return false }; nodes.iter().any(|node| match *node { - CharacterClassItem::Node(ref node) => node.matches(s), + CharacterClassItem::Node(ref node) => node.matches(s, groups), CharacterClassItem::Range(start, end) => { let matches = (start..=end).contains(&cur); if matches { diff --git a/crates/dash_vm/src/js_std/regex.rs b/crates/dash_vm/src/js_std/regex.rs index 38680f65..f0936768 100644 --- a/crates/dash_vm/src/js_std/regex.rs +++ b/crates/dash_vm/src/js_std/regex.rs @@ -1,5 +1,7 @@ use crate::throw; +use crate::value::array::Array; use crate::value::function::native::CallContext; +use crate::value::object::PropertyValue; use crate::value::ops::conversions::ValueConversion; use crate::value::regex::{RegExp, RegExpInner}; use crate::value::{Value, ValueContext}; @@ -27,7 +29,7 @@ pub fn test(cx: CallContext) -> Result { None => throw!(cx.scope, TypeError, "Receiver must be a RegExp"), }; - let RegExpInner { nodes, last_index, .. } = match regex.inner() { + let RegExpInner { regex, last_index, .. } = match regex.inner() { Some(nodes) => nodes, None => throw!(cx.scope, TypeError, "Receiver must be an initialized RegExp object"), }; @@ -37,12 +39,54 @@ pub fn test(cx: CallContext) -> Result { return Ok(Value::Boolean(false)); } - let mut matcher = RegexMatcher::new(nodes, text[last_index.get()..].as_bytes()); - if let Some(m) = matcher.matches() { - last_index.set(last_index.get() + m.groups[0].end); + let mut matcher = RegexMatcher::new(regex, text[last_index.get()..].as_bytes()); + if matcher.matches() { + last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end); Ok(Value::Boolean(true)) } else { last_index.set(0); Ok(Value::Boolean(false)) } } + +pub fn exec(cx: CallContext<'_, '_>) -> Result { + let text = cx.args.first().unwrap_or_undefined().to_string(cx.scope)?; + + let regex = match cx.this.downcast_ref::() { + Some(regex) => regex, + None => throw!(cx.scope, TypeError, "Receiver must be a RegExp"), + }; + + let RegExpInner { regex, last_index, .. } = match regex.inner() { + Some(nodes) => nodes, + None => throw!(cx.scope, TypeError, "Receiver must be an initialized RegExp object"), + }; + + if last_index.get() >= text.len() { + last_index.set(0); + return Ok(Value::null()); + } + + let mut matcher = RegexMatcher::new(regex, text[last_index.get()..].as_bytes()); + if matcher.matches() { + last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end); + let groups = Array::from_vec( + cx.scope, + matcher + .groups + .iter() + .map(|g| { + let sub = match g { + Some(r) => text[r].into(), + None => cx.scope.statics.null_str(), + }; + PropertyValue::static_default(Value::String(sub)) + }) + .collect(), + ); + Ok(Value::Object(cx.scope.register(groups))) + } else { + last_index.set(0); + Ok(Value::null()) + } +} diff --git a/crates/dash_vm/src/lib.rs b/crates/dash_vm/src/lib.rs index 6e2ceffb..817783e9 100644 --- a/crates/dash_vm/src/lib.rs +++ b/crates/dash_vm/src/lib.rs @@ -869,6 +869,7 @@ impl Vm { regexp_ctor.clone(), [ ("test", scope.statics.regexp_test.clone()), + ("exec", scope.statics.regexp_exec.clone()) ], [], [], diff --git a/crates/dash_vm/src/statics.rs b/crates/dash_vm/src/statics.rs index fd1ff453..7857d08e 100644 --- a/crates/dash_vm/src/statics.rs +++ b/crates/dash_vm/src/statics.rs @@ -247,6 +247,7 @@ pub struct Statics { pub regexp_ctor: Handle, pub regexp_prototype: Handle, pub regexp_test: Handle, + pub regexp_exec: Handle, pub date_ctor: Handle, pub date_prototype: Handle, pub date_now: Handle, @@ -503,6 +504,7 @@ impl Statics { regexp_ctor: function(gc, "RegExp", js_std::regex::constructor), regexp_prototype: builtin_object(gc, RegExp::empty()), regexp_test: function(gc, "test", js_std::regex::test), + regexp_exec: function(gc, "exec", js_std::regex::exec), date_ctor: function(gc, "Date", js_std::date::constructor), date_prototype: builtin_object(gc, NamedObject::null()), date_now: function(gc, "now", js_std::date::now), diff --git a/crates/dash_vm/src/value/regex.rs b/crates/dash_vm/src/value/regex.rs index 2b9aea01..cca17f93 100644 --- a/crates/dash_vm/src/value/regex.rs +++ b/crates/dash_vm/src/value/regex.rs @@ -2,7 +2,7 @@ use std::cell::Cell; use std::rc::Rc; use dash_proc_macro::Trace; -use dash_regex::node::Node; +use dash_regex::ParsedRegex; use crate::{delegate, Vm}; @@ -10,7 +10,7 @@ use super::object::{NamedObject, Object}; #[derive(Debug)] pub struct RegExpInner { - pub nodes: Vec, + pub regex: ParsedRegex, pub source: Rc, // TODO: this should only exist if the `g` flag is set (we currently don't even have regex flags) pub last_index: Cell, @@ -23,13 +23,13 @@ pub struct RegExp { } impl RegExp { - pub fn new(nodes: Vec, source: Rc, vm: &Vm) -> Self { + pub fn new(regex: ParsedRegex, source: Rc, vm: &Vm) -> Self { let proto = vm.statics.regexp_prototype.clone(); let ctor = vm.statics.regexp_ctor.clone(); Self { inner: Some(RegExpInner { - nodes, + regex, source, last_index: Cell::new(0), }), diff --git a/testrunner/src/cmd/run.rs b/testrunner/src/cmd/run.rs index 37bfdca4..cd992cc5 100644 --- a/testrunner/src/cmd/run.rs +++ b/testrunner/src/cmd/run.rs @@ -1,10 +1,8 @@ use std::collections::HashMap; -use std::ffi::OsStr; -use std::ffi::OsString; +use std::ffi::{OsStr, OsString}; use std::panic; -use std::sync::atomic; use std::sync::atomic::AtomicU32; -use std::sync::Mutex; +use std::sync::{atomic, Mutex}; use clap::ArgMatches; use dash_vm::eval::EvalError;