From 26966190d826f08516f43ae84b1610fc20fcd674 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Wed, 28 Feb 2024 14:38:22 +0000 Subject: [PATCH] perf(parser): faster lexing template strings --- crates/oxc_parser/src/lexer/mod.rs | 2 - crates/oxc_parser/src/lexer/source.rs | 65 +++- crates/oxc_parser/src/lexer/string_builder.rs | 74 ---- crates/oxc_parser/src/lexer/template.rs | 341 +++++++++++++++--- 4 files changed, 343 insertions(+), 139 deletions(-) delete mode 100644 crates/oxc_parser/src/lexer/string_builder.rs diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 4502aa3134d00..79ba772cccd57 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -19,7 +19,6 @@ mod regex; mod search; mod source; mod string; -mod string_builder; mod template; mod token; mod trivia_builder; @@ -38,7 +37,6 @@ use oxc_span::{SourceType, Span}; use self::{ byte_handlers::handle_byte, source::{Source, SourcePosition}, - string_builder::AutoCow, trivia_builder::TriviaBuilder, }; pub use self::{ diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index 689067232545b..0ec4a8875f1ab 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -207,7 +207,7 @@ impl<'a> Source<'a> { /// Get string slice from a `SourcePosition` up to the current position of `Source`, /// without checks. /// - /// SAFETY: + /// # SAFETY /// `pos` must not be after current position of `Source`. /// This is always the case if both: /// 1. `Source::set_position` has not been called since `pos` was created. @@ -215,34 +215,63 @@ impl<'a> Source<'a> { #[inline] pub(super) unsafe fn str_from_pos_to_current_unchecked(&self, pos: SourcePosition) -> &'a str { // SAFETY: Caller guarantees `pos` is not after current position of `Source`. + // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`. 
+ self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr)) + } + + /// Get string slice from current position of `Source` up to a `SourcePosition`, without checks. + /// + /// # SAFETY + /// `pos` must not be before current position of `Source`. + /// This is always the case if both: + /// 1. `Source::set_position` has not been called since `pos` was created. + /// 2. `pos` has not been moved backwards with `SourcePosition::sub`. + #[inline] + pub(super) unsafe fn str_from_current_to_pos_unchecked(&self, pos: SourcePosition) -> &'a str { + // SAFETY: Caller guarantees `pos` is not before current position of `Source`. + // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`. + self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos) + } + + /// Get string slice from a `SourcePosition` up to the end of `Source`. + #[inline] + pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str { + // SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`, + // and always on a UTF-8 character boundary. + // `self.end` is always a valid `SourcePosition` due to invariants of `Source`. + unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.end)) } + } + + /// Get string slice of source between 2 `SourcePosition`s, without checks. + /// + /// # SAFETY + /// `start` must not be after `end`. + #[inline] + pub(super) unsafe fn str_between_positions_unchecked( + &self, + start: SourcePosition, + end: SourcePosition, + ) -> &'a str { + debug_assert!(start.ptr <= end.ptr); + debug_assert!(start.ptr >= self.start); + debug_assert!(end.ptr <= self.end); + + // SAFETY: Caller guarantees `start` is not after `end`. // `SourcePosition`s can only be created from a `Source`. // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source` // in existence on this thread. 
`Source` is not `Sync` or `Send`, so no possibility another // `Source` originated on another thread can "jump" onto this one. // This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be - // from this `Source`, therefore `pos.ptr` and `self.ptr` must both be within the same allocation - // and derived from the same original pointer. + // from this `Source`, therefore `start.ptr` and `end.ptr` must both be within the same + // allocation, and derived from the same original pointer. // Invariants of `Source` and `SourcePosition` types guarantee that both are positioned // on UTF-8 character boundaries. So slicing source text between these 2 points will always // yield a valid UTF-8 string. - debug_assert!(pos.ptr <= self.ptr); - let len = self.ptr as usize - pos.addr(); - let slice = slice::from_raw_parts(pos.ptr, len); + let len = end.addr() - start.addr(); + let slice = slice::from_raw_parts(start.ptr, len); std::str::from_utf8_unchecked(slice) } - /// Get string slice from a `SourcePosition` up to the end of `Source`. - #[inline] - pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str { - // SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`, - // and always on a UTF-8 character boundary - unsafe { - let len = self.end as usize - pos.addr(); - let slice = slice::from_raw_parts(pos.ptr, len); - std::str::from_utf8_unchecked(slice) - } - } - /// Get current position in source, relative to start of source. 
#[allow(clippy::cast_possible_truncation)] #[inline] diff --git a/crates/oxc_parser/src/lexer/string_builder.rs b/crates/oxc_parser/src/lexer/string_builder.rs deleted file mode 100644 index 3b6961c9fa612..0000000000000 --- a/crates/oxc_parser/src/lexer/string_builder.rs +++ /dev/null @@ -1,74 +0,0 @@ -// Copied from https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/lexer.rs#L2256 - -use oxc_allocator::String; - -use crate::lexer::Lexer; - -pub struct AutoCow<'a> { - pub start: &'a str, - pub value: Option>, -} - -impl<'a> AutoCow<'a> { - pub fn new(lexer: &Lexer<'a>) -> Self { - let start = lexer.remaining(); - AutoCow { start, value: None } - } - - // Push a char that matches `lexer.next_char()`. - pub fn push_matching(&mut self, c: char) { - if let Some(text) = &mut self.value { - text.push(c); - } - } - - // Push a different character than `lexer.next_char()`. - // force_allocation_without_current_ascii_char must be called before this. - pub fn push_different(&mut self, c: char) { - debug_assert!(self.value.is_some()); - self.value.as_mut().unwrap().push(c); - } - - // Force allocation of a String, excluding the current ASCII character, - // and return the reference to it - pub fn get_mut_string_without_current_ascii_char<'b>( - &'b mut self, - lexer: &Lexer<'a>, - ) -> &'b mut String<'a> { - self.force_allocation_without_current_ascii_char(lexer); - self.value.as_mut().unwrap() - } - - // Force allocation of a String, excluding the current ASCII character. 
- pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) { - if self.value.is_some() { - return; - } - self.value = Some(String::from_str_in( - &self.start[..self.start.len() - lexer.remaining().len() - 1], - lexer.allocator, - )); - } - - // Check if the string contains a different character, such as an escape sequence - pub fn has_escape(&self) -> bool { - self.value.is_some() - } - - // TODO: Delete this if not using it - #[allow(dead_code)] - pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str { - match self.value.take() { - Some(s) => s.into_bump_str(), - None => &self.start[..self.start.len() - lexer.remaining().len()], - } - } - - // Just like finish, but without pushing current char. - pub fn finish_without_push(mut self, lexer: &Lexer<'a>) -> &'a str { - match self.value.take() { - Some(s) => s.into_bump_str(), - None => &self.start[..self.start.len() - lexer.remaining().len() - 1], - } - } -} diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs index 7ebb41943d327..bbafcbd958708 100644 --- a/crates/oxc_parser/src/lexer/template.rs +++ b/crates/oxc_parser/src/lexer/template.rs @@ -1,47 +1,306 @@ -use super::{AutoCow, Kind, Lexer, Token}; +use super::{ + cold_branch, + search::{byte_search, safe_byte_match_table, SafeByteMatchTable}, + Kind, Lexer, SourcePosition, Token, +}; use crate::diagnostics; -use oxc_syntax::identifier::{CR, LF}; +use std::cmp::max; + +use oxc_allocator::String; + +const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16; + +static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable = + safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\')); impl<'a> Lexer<'a> { /// 12.8.6 Template Literal Lexical Components + + /// Read template literal component. 
pub(super) fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind { - let mut builder = AutoCow::new(self); - let mut is_valid_escape_sequence = true; - while let Some(c) = self.next_char() { - match c { - '$' if self.peek() == Some('{') => { - self.save_template_string( - is_valid_escape_sequence, - builder.has_escape(), - builder.finish_without_push(self), - ); - self.consume_char(); - return substitute; - } - '`' => { - self.save_template_string( - is_valid_escape_sequence, - builder.has_escape(), - builder.finish_without_push(self), - ); - return tail; - } - CR => { - builder.force_allocation_without_current_ascii_char(self); - if self.next_eq(LF) { - builder.push_different(LF); + byte_search! { + lexer: self, + table: TEMPLATE_LITERAL_TABLE, + continue_if: |next_byte, pos| { + match next_byte { + b'$' => { + // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary + let after_dollar = unsafe { pos.add(1) }; + if after_dollar.addr() < self.source.end_addr() { + // If `${`, exit. + // SAFETY: Have checked there's at least 1 further byte to read. + if unsafe { after_dollar.read() } == b'{' { + // Consume `${` and exit. + // SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary. + self.source.set_position(unsafe { after_dollar.add(1) }); + return substitute; + } + // Not `${`. Continue searching. + true + } else { + // This is last byte in file. Continue to `handle_eof`. + // This is illegal in valid JS, so mark this branch cold. + cold_branch(|| true) + } + }, + b'`' => { + // Consume b'`' and exit. + // SAFETY: Char at `pos` is '`', so `pos + 1` is a UTF-8 char boundary. + let after_backtick = unsafe { pos.add(1) }; + self.source.set_position(after_backtick); + return tail; + }, + b'\r' => { + // SAFETY: Byte at `pos` is `\r`. + // `pos` has only been advanced relative to `self.source.position()`. 
+ return unsafe { self.template_literal_carriage_return(pos, substitute, tail) }; + } + _ => { + // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\` + debug_assert!(next_byte == b'\\'); + // SAFETY: Byte at `pos` is `\`. + // `pos` has only been advanced relative to `self.source.position()`. + return unsafe { self.template_literal_backslash(pos, substitute, tail) }; } } - '\\' => { - let text = builder.get_mut_string_without_current_ascii_char(self); - self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence); - } - _ => builder.push_matching(c), - } + }, + handle_match: |_next_byte, _start| { + // TODO: This should be `unreachable!()` + Kind::Undetermined + }, + handle_eof: |_start| { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + }, + }; + } + + /// Consume rest of template literal after a `\r` is found. + /// + /// # SAFETY + /// * Byte at `pos` must be `\r`. + /// * `pos` must not be before `self.source.position()`. + unsafe fn template_literal_carriage_return( + &mut self, + mut pos: SourcePosition<'a>, + substitute: Kind, + tail: Kind, + ) -> Kind { + // Create arena string to hold modified template literal, containing up to before `\r`. + // SAFETY: Caller guarantees `pos` is not before `self.source.position()`. + let str = self.template_literal_create_string(pos); + + // Skip `\r`. + // SAFETY: Caller guarantees byte at `pos` is `\r`, so `pos + 1` is a UTF-8 char boundary. + pos = pos.add(1); + + // If at EOF, exit. This is illegal in valid JS, so cold branch. + if pos.addr() == self.source.end_addr() { + return cold_branch(|| { + self.source.advance_to_end(); + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + }); + } + + // Start next chunk after `\r` + let chunk_start = pos; + + // If next char is `\n`, start next search after it. + // `\n` is first char of next chunk, so it'll get added to `str` when chunk is pushed. 
+ // SAFETY: Have checked not at EOF. + if pos.read() == b'\n' { + // SAFETY: `\n` is ASCII, so advancing past it leaves `pos` on a UTF-8 char boundary + pos = pos.add(1); } - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - Kind::Undetermined + + self.template_literal_different(str, pos, chunk_start, true, substitute, tail) + } + + /// Consume rest of template literal after a `\` escape is found. + /// + /// # SAFETY + /// * Byte at `pos` must be `\`. + /// * `pos` must not be before `self.source.position()`. + unsafe fn template_literal_backslash( + &mut self, + pos: SourcePosition<'a>, + substitute: Kind, + tail: Kind, + ) -> Kind { + // Create arena string to hold modified template literal, containing up to before `\`. + // SAFETY: Caller guarantees `pos` is not before `self.source.position()`. + let mut str = self.template_literal_create_string(pos); + + // Decode escape sequence into `str`. + // `read_string_escape_sequence` expects `self.source` to be positioned after `\`. + // SAFETY: Caller guarantees next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary. + let after_backslash = pos.add(1); + self.source.set_position(after_backslash); + + let mut is_valid_escape_sequence = true; + self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence); + + // Continue search after escape + let after_escape = self.source.position(); + // SAFETY: `pos` and `chunk_start` are the same + self.template_literal_different( + str, + after_escape, + after_escape, + is_valid_escape_sequence, + substitute, + tail, + ) + } + + /// Create arena string for modified template literal, containing the template literal up to `pos`. + /// # SAFETY + /// `pos` must not be before `self.source.position()` + unsafe fn template_literal_create_string(&self, pos: SourcePosition) -> String<'a> { + // Create arena string to hold modified template literal. + // We don't know how long template literal will end up being. 
Take a guess that total length + // will be double what we've seen so far, or `MIN_ESCAPED_TEMPLATE_LIT_LEN` minimum. + // SAFETY: Caller guarantees `pos` is not before `self.source.position()`. + let so_far = self.source.str_from_current_to_pos_unchecked(pos); + let capacity = max(so_far.len() * 2, MIN_ESCAPED_TEMPLATE_LIT_LEN); + let mut str = String::with_capacity_in(capacity, self.allocator); + str.push_str(so_far); + str + } + + /// Process template literal after `\r` or `\` found. + /// # SAFETY + /// `chunk_start` must not be after `pos`. + unsafe fn template_literal_different( + &mut self, + mut str: String<'a>, + pos: SourcePosition<'a>, + mut chunk_start: SourcePosition<'a>, + mut is_valid_escape_sequence: bool, + substitute: Kind, + tail: Kind, + ) -> Kind { + byte_search! { + lexer: self, + table: TEMPLATE_LITERAL_TABLE, + start: pos, + continue_if: |next_byte, pos| { + match next_byte { + b'$' => { + // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary + let after_dollar = pos.add(1); + if after_dollar.addr() < self.source.end_addr() { + // If `${`, exit. + // SAFETY: Have checked there's at least 1 further byte to read. + if unsafe { after_dollar.read() } == b'{' { + // Add last chunk to `str` and record string. + // SAFETY: TODO + let chunk = self.source.str_between_positions_unchecked(chunk_start, pos); + str.push_str(chunk); + self.save_template_string( + is_valid_escape_sequence, + str.into_bump_str(), + ); + + // Consume `${` and exit. + // SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary. + self.source.set_position(unsafe { after_dollar.add(1) }); + return substitute; + } + // Not `${`. Continue searching. + true + } else { + // This is last byte in file. Continue to `handle_eof`. + // This is illegal in valid JS, so mark this branch cold. + cold_branch(|| true) + } + }, + b'`' => { + // Add last chunk to `str` and record string. 
+ // SAFETY: TODO + let chunk = self.source.str_between_positions_unchecked(chunk_start, pos); + str.push_str(chunk); + self.save_template_string( + is_valid_escape_sequence, + str.into_bump_str(), + ); + + // Consume b'`' and exit. + // SAFETY: Byte at `pos` is '`', so `pos + 1` is a UTF-8 char boundary. + let after_backtick = pos.add(1); + self.source.set_position(after_backtick); + return tail; + }, + b'\r' => { + // Add before `\r` to `str`. + // SAFETY: TODO + let chunk = self.source.str_between_positions_unchecked(chunk_start, pos); + str.push_str(chunk); + + // Set next chunk to start after `\r`. + // SAFETY: TODO + chunk_start = pos.add(1); + + if chunk_start.addr() < self.source.end_addr() { + // If next char is `\n`, start next search after it. + // NB: `byte_search!` macro already advances `pos` by 1, so only advance + // by 1 here, so that in total we skip 2 bytes for `\r\n`. + // No need to push `\n` to `str`, as it's 1st char of next chunk, + // and will be added to `str` when next chunk is pushed. + if chunk_start.read() == b'\n' { + // SAFETY: TODO + pos = chunk_start; + } + true + } else { + // This is last byte in file. Continue to `handle_eof`. + // This is illegal in valid JS, so mark this branch cold. + cold_branch(|| true) + } + } + _ => { + // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\` + debug_assert!(next_byte == b'\\'); + + // Add chunk before escape to `str`. + // SAFETY: TODO + let chunk = self.source.str_between_positions_unchecked(chunk_start, pos); + str.push_str(chunk); + + // Decode escape sequence into `str`. + // `read_string_escape_sequence` expects `self.source` to be positioned after `\`. + // SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary. 
+ let after_backslash = pos.add(1); + self.source.set_position(after_backslash); + self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence); + + // Start next chunk after escape sequence + chunk_start = self.source.position(); + + // Continue search after escape sequence. + // NB: `byte_search!` macro increments `pos`, so need to subtract 1 here + // to counteract that. + // SAFETY: Added 1 to `pos` above, and `read_string_escape_sequence` only + // advances `self.source`, so subtracting 1 again is within bounds. + // TODO: This isn't good. It relies on behavior of `read_string_escape_sequence`, + // which makes no promise not to rewind `Source`. + pos = chunk_start.sub(1); + + true + } + } + }, + handle_match: |_next_byte, _start| { + // TODO: This should be `unreachable!()` + Kind::Undetermined + }, + handle_eof: |_start| { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + }, + }; } /// Re-tokenize the current `}` token for `TemplateSubstitutionTail` @@ -53,16 +312,8 @@ impl<'a> Lexer<'a> { self.finish_next(kind) } - /// Save the template if it is escaped - fn save_template_string( - &mut self, - is_valid_escape_sequence: bool, - has_escape: bool, - s: &'a str, - ) { - if !has_escape { - return; - } + /// Save template string + fn save_template_string(&mut self, is_valid_escape_sequence: bool, s: &'a str) { self.escaped_templates.insert(self.token.start, is_valid_escape_sequence.then_some(s)); self.token.escaped = true; }