From c7316856dbe85b5d7435149399765627825f488e Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Fri, 12 Jan 2024 03:36:30 +0000 Subject: [PATCH] refactor(parser): reduce work parsing regexps (#1999) #1926 produced a small performance regression because when parsing a regexp, some work is repeated. --- crates/oxc_parser/src/cursor.rs | 7 +++++-- crates/oxc_parser/src/js/expression.rs | 21 ++++----------------- crates/oxc_parser/src/lexer/mod.rs | 24 ++++++++++++++---------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/crates/oxc_parser/src/cursor.rs b/crates/oxc_parser/src/cursor.rs index eb9e3e7f10bb5..79511adcb78ff 100644 --- a/crates/oxc_parser/src/cursor.rs +++ b/crates/oxc_parser/src/cursor.rs @@ -1,5 +1,6 @@ //! Code related to navigating `Token`s from the lexer +use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::Result; use oxc_span::Span; @@ -200,8 +201,10 @@ impl<'a> Parser<'a> { } /// Tell lexer to read a regex - pub(crate) fn read_regex(&mut self) { - self.token = self.lexer.next_regex(self.cur_kind()); + pub(crate) fn read_regex(&mut self) -> (u32, RegExpFlags) { + let (token, pattern_end, flags) = self.lexer.next_regex(self.cur_kind()); + self.token = token; + (pattern_end, flags) } /// Tell lexer to read a template substitution tail diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index 7cf643f4d4e57..ab8f069daf1e9 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ b/crates/oxc_parser/src/js/expression.rs @@ -180,7 +180,6 @@ impl<'a> Parser<'a> { } Kind::LParen => self.parse_parenthesized_expression(span), Kind::Slash | Kind::SlashEq => { - self.read_regex(); let literal = self.parse_literal_regexp(); Ok(self.ast.literal_regexp_expression(literal)) } @@ -320,22 +319,10 @@ impl<'a> Parser<'a> { pub(crate) fn parse_literal_regexp(&mut self) -> RegExpLiteral { let span = self.start_span(); - // split out the flag part of `/regex/flag` by looking for `/` from the end - let regex_src = self.cur_src(); - let mut flags = RegExpFlags::empty(); - - let mut split_index = None; - for (i, c) in regex_src.char_indices().rev() { - if let Ok(flag) = RegExpFlags::try_from(c) { - flags |= flag; - } else { - split_index.replace(i); - break; - } - } - - // `/` are omitted from the pattern - let pattern = split_index.map_or(regex_src, |i| regex_src.get(1..i).unwrap_or("")); + // split out pattern + let (pattern_end, flags) = self.read_regex(); + let pattern_start = self.cur_token().start + 1; // +1 to exclude `/` + let pattern = &self.source_text[pattern_start as usize..pattern_end as usize]; self.bump_any(); self.ast.reg_exp_literal(self.end_span(span), pattern, flags) diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 15195c350c438..df2c501fe6189 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -192,16 +192,17 @@ impl<'a> Lexer<'a> { /// where a `RegularExpressionLiteral` is permitted /// Which means the parser needs to re-tokenize on `PrimaryExpression`, /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression` - pub fn next_regex(&mut self, kind: Kind) -> Token { + pub fn next_regex(&mut self, kind: Kind) -> (Token, u32, RegExpFlags) { self.current.token.start = self.offset() - match kind { Kind::Slash => 1, Kind::SlashEq => 2, _ => unreachable!(), }; - let kind = self.read_regex(); + let (pattern_end, flags) = self.read_regex(); self.lookahead.clear(); - self.finish_next(kind) + let token = self.finish_next(Kind::RegExp); + (token, pattern_end, flags) } pub fn next_right_angle(&mut self) -> Token { @@ -828,18 +829,20 @@ impl<'a> Lexer<'a> { } /// 12.9.5 Regular Expression Literals - fn read_regex(&mut self) -> Kind { + fn read_regex(&mut self) -> (u32, RegExpFlags) { let mut in_escape = false; let mut in_character_class = false; loop { match self.current.chars.next() { None => { self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); - return Kind::Undetermined; + return (self.offset(), RegExpFlags::empty()); } Some(c) if is_line_terminator(c) => { self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); - return Kind::Undetermined; + #[allow(clippy::cast_possible_truncation)] + let pattern_end = self.offset() - c.len_utf8() as u32; + return (pattern_end, RegExpFlags::empty()); } Some(c) => { if in_escape { @@ -857,28 +860,29 @@ impl<'a> Lexer<'a> { } } + let pattern_end = self.offset() - 1; // -1 to exclude `/` let mut flags = RegExpFlags::empty(); while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() { self.current.chars.next(); if !ch.is_ascii_lowercase() { self.error(diagnostics::RegExpFlag(ch, self.current_offset())); - return Kind::Undetermined; + break; } let flag = if let Ok(flag) = RegExpFlags::try_from(ch) { flag } else { self.error(diagnostics::RegExpFlag(ch, self.current_offset())); - return Kind::Undetermined; + break; }; if flags.contains(flag) { self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset())); - return Kind::Undetermined; + break; } flags |= flag; } - Kind::RegExp + (pattern_end, flags) } /// 12.8.6 Template Literal Lexical Components