From 26966190d826f08516f43ae84b1610fc20fcd674 Mon Sep 17 00:00:00 2001
From: overlookmotel <theoverlookmotel@gmail.com>
Date: Wed, 28 Feb 2024 14:38:22 +0000
Subject: [PATCH] perf(parser): faster lexing template strings

---
 crates/oxc_parser/src/lexer/mod.rs            |   2 -
 crates/oxc_parser/src/lexer/source.rs         |  65 +++-
 crates/oxc_parser/src/lexer/string_builder.rs |  74 ----
 crates/oxc_parser/src/lexer/template.rs       | 341 +++++++++++++++---
 4 files changed, 343 insertions(+), 139 deletions(-)
 delete mode 100644 crates/oxc_parser/src/lexer/string_builder.rs

diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
index 4502aa3134d00..79ba772cccd57 100644
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -19,7 +19,6 @@ mod regex;
 mod search;
 mod source;
 mod string;
-mod string_builder;
 mod template;
 mod token;
 mod trivia_builder;
@@ -38,7 +37,6 @@ use oxc_span::{SourceType, Span};
 use self::{
     byte_handlers::handle_byte,
     source::{Source, SourcePosition},
-    string_builder::AutoCow,
     trivia_builder::TriviaBuilder,
 };
 pub use self::{
diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs
index 689067232545b..0ec4a8875f1ab 100644
--- a/crates/oxc_parser/src/lexer/source.rs
+++ b/crates/oxc_parser/src/lexer/source.rs
@@ -207,7 +207,7 @@ impl<'a> Source<'a> {
     /// Get string slice from a `SourcePosition` up to the current position of `Source`,
     /// without checks.
     ///
-    /// SAFETY:
+    /// # SAFETY
     /// `pos` must not be after current position of `Source`.
     /// This is always the case if both:
     /// 1. `Source::set_position` has not been called since `pos` was created.
@@ -215,34 +215,63 @@ impl<'a> Source<'a> {
     #[inline]
     pub(super) unsafe fn str_from_pos_to_current_unchecked(&self, pos: SourcePosition) -> &'a str {
         // SAFETY: Caller guarantees `pos` is not after current position of `Source`.
+        // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
+        self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
+    }
+
+    /// Get string slice from current position of `Source` up to a `SourcePosition`, without checks.
+    ///
+    /// # SAFETY
+    /// `pos` must not be before current position of `Source`.
+    /// This is always the case if both:
+    /// 1. `Source::set_position` has not been called since `pos` was created.
+    /// 2. `pos` has not been moved backwards with `SourcePosition::sub`.
+    #[inline]
+    pub(super) unsafe fn str_from_current_to_pos_unchecked(&self, pos: SourcePosition) -> &'a str {
+        // SAFETY: Caller guarantees `pos` is not before current position of `Source`.
+        // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
+        self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos)
+    }
+
+    /// Get string slice from a `SourcePosition` up to the end of `Source`.
+    #[inline]
+    pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
+        // SAFETY: Invariants of `SourcePosition` are that it cannot be after end of `Source`,
+        // and always on a UTF-8 character boundary.
+        // `self.end` is always a valid `SourcePosition` due to invariants of `Source`.
+        unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.end)) }
+    }
+
+    /// Get string slice of source between 2 `SourcePosition`s, without checks.
+    ///
+    /// # SAFETY
+    /// `start` must not be after `end`.
+    #[inline]
+    pub(super) unsafe fn str_between_positions_unchecked(
+        &self,
+        start: SourcePosition,
+        end: SourcePosition,
+    ) -> &'a str {
+        debug_assert!(start.ptr <= end.ptr);
+        debug_assert!(start.ptr >= self.start);
+        debug_assert!(end.ptr <= self.end);
+
+        // SAFETY: Caller guarantees `start` is not after `end`.
         // `SourcePosition`s can only be created from a `Source`.
         // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
         // in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
         // `Source` originated on another thread can "jump" onto this one.
         // This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
-        // from this `Source`, therefore `pos.ptr` and `self.ptr` must both be within the same allocation
-        // and derived from the same original pointer.
+        // from this `Source`, therefore `start.ptr` and `end.ptr` must both be within the same
+        // allocation, and derived from the same original pointer.
         // Invariants of `Source` and `SourcePosition` types guarantee that both are positioned
         // on UTF-8 character boundaries. So slicing source text between these 2 points will always
         // yield a valid UTF-8 string.
-        debug_assert!(pos.ptr <= self.ptr);
-        let len = self.ptr as usize - pos.addr();
-        let slice = slice::from_raw_parts(pos.ptr, len);
+        let len = end.addr() - start.addr();
+        let slice = slice::from_raw_parts(start.ptr, len);
         std::str::from_utf8_unchecked(slice)
     }
 
-    /// Get string slice from a `SourcePosition` up to the end of `Source`.
-    #[inline]
-    pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
-        // SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`,
-        // and always on a UTF-8 character boundary
-        unsafe {
-            let len = self.end as usize - pos.addr();
-            let slice = slice::from_raw_parts(pos.ptr, len);
-            std::str::from_utf8_unchecked(slice)
-        }
-    }
-
     /// Get current position in source, relative to start of source.
     #[allow(clippy::cast_possible_truncation)]
     #[inline]
diff --git a/crates/oxc_parser/src/lexer/string_builder.rs b/crates/oxc_parser/src/lexer/string_builder.rs
deleted file mode 100644
index 3b6961c9fa612..0000000000000
--- a/crates/oxc_parser/src/lexer/string_builder.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copied from https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/lexer.rs#L2256
-
-use oxc_allocator::String;
-
-use crate::lexer::Lexer;
-
-pub struct AutoCow<'a> {
-    pub start: &'a str,
-    pub value: Option<String<'a>>,
-}
-
-impl<'a> AutoCow<'a> {
-    pub fn new(lexer: &Lexer<'a>) -> Self {
-        let start = lexer.remaining();
-        AutoCow { start, value: None }
-    }
-
-    // Push a char that matches `lexer.next_char()`.
-    pub fn push_matching(&mut self, c: char) {
-        if let Some(text) = &mut self.value {
-            text.push(c);
-        }
-    }
-
-    // Push a different character than `lexer.next_char()`.
-    // force_allocation_without_current_ascii_char must be called before this.
-    pub fn push_different(&mut self, c: char) {
-        debug_assert!(self.value.is_some());
-        self.value.as_mut().unwrap().push(c);
-    }
-
-    // Force allocation of a String, excluding the current ASCII character,
-    // and return the reference to it
-    pub fn get_mut_string_without_current_ascii_char<'b>(
-        &'b mut self,
-        lexer: &Lexer<'a>,
-    ) -> &'b mut String<'a> {
-        self.force_allocation_without_current_ascii_char(lexer);
-        self.value.as_mut().unwrap()
-    }
-
-    // Force allocation of a String, excluding the current ASCII character.
-    pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) {
-        if self.value.is_some() {
-            return;
-        }
-        self.value = Some(String::from_str_in(
-            &self.start[..self.start.len() - lexer.remaining().len() - 1],
-            lexer.allocator,
-        ));
-    }
-
-    // Check if the string contains a different character, such as an escape sequence
-    pub fn has_escape(&self) -> bool {
-        self.value.is_some()
-    }
-
-    // TODO: Delete this if not using it
-    #[allow(dead_code)]
-    pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str {
-        match self.value.take() {
-            Some(s) => s.into_bump_str(),
-            None => &self.start[..self.start.len() - lexer.remaining().len()],
-        }
-    }
-
-    // Just like finish, but without pushing current char.
-    pub fn finish_without_push(mut self, lexer: &Lexer<'a>) -> &'a str {
-        match self.value.take() {
-            Some(s) => s.into_bump_str(),
-            None => &self.start[..self.start.len() - lexer.remaining().len() - 1],
-        }
-    }
-}
diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs
index 7ebb41943d327..bbafcbd958708 100644
--- a/crates/oxc_parser/src/lexer/template.rs
+++ b/crates/oxc_parser/src/lexer/template.rs
@@ -1,47 +1,306 @@
-use super::{AutoCow, Kind, Lexer, Token};
+use super::{
+    cold_branch,
+    search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
+    Kind, Lexer, SourcePosition, Token,
+};
 use crate::diagnostics;
 
-use oxc_syntax::identifier::{CR, LF};
+use std::cmp::max;
+
+use oxc_allocator::String;
+
+const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16;
+
+static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\'));
 
 impl<'a> Lexer<'a> {
     /// 12.8.6 Template Literal Lexical Components
+
+    /// Read template literal component.
     pub(super) fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind {
-        let mut builder = AutoCow::new(self);
-        let mut is_valid_escape_sequence = true;
-        while let Some(c) = self.next_char() {
-            match c {
-                '$' if self.peek() == Some('{') => {
-                    self.save_template_string(
-                        is_valid_escape_sequence,
-                        builder.has_escape(),
-                        builder.finish_without_push(self),
-                    );
-                    self.consume_char();
-                    return substitute;
-                }
-                '`' => {
-                    self.save_template_string(
-                        is_valid_escape_sequence,
-                        builder.has_escape(),
-                        builder.finish_without_push(self),
-                    );
-                    return tail;
-                }
-                CR => {
-                    builder.force_allocation_without_current_ascii_char(self);
-                    if self.next_eq(LF) {
-                        builder.push_different(LF);
+        byte_search! {
+            lexer: self,
+            table: TEMPLATE_LITERAL_TABLE,
+            continue_if: |next_byte, pos| {
+                match next_byte {
+                    b'$' => {
+                        // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary
+                        let after_dollar = unsafe { pos.add(1) };
+                        if after_dollar.addr() < self.source.end_addr() {
+                            // If `${`, exit.
+                            // SAFETY: Have checked there's at least 1 further byte to read.
+                            if unsafe { after_dollar.read() } == b'{' {
+                                // Consume `${` and exit.
+                                // SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary.
+                                self.source.set_position(unsafe { after_dollar.add(1) });
+                                return substitute;
+                            }
+                            // Not `${`. Continue searching.
+                            true
+                        } else {
+                            // This is last byte in file. Continue to `handle_eof`.
+                            // This is illegal in valid JS, so mark this branch cold.
+                            cold_branch(|| true)
+                        }
+                    },
+                    b'`' => {
+                        // Consume b'`' and exit.
+                        // SAFETY: Char at `pos` is '`', so `pos + 1` is a UTF-8 char boundary.
+                        let after_backtick = unsafe { pos.add(1) };
+                        self.source.set_position(after_backtick);
+                        return tail;
+                    },
+                    b'\r' => {
+                        // SAFETY: Byte at `pos` is `\r`.
+                        // `pos` has only been advanced relative to `self.source.position()`.
+                        return unsafe { self.template_literal_carriage_return(pos, substitute, tail) };
+                    }
+                    _ => {
+                        // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
+                        debug_assert!(next_byte == b'\\');
+                        // SAFETY: Byte at `pos` is `\`.
+                        // `pos` has only been advanced relative to `self.source.position()`.
+                        return unsafe { self.template_literal_backslash(pos, substitute, tail) };
                     }
                 }
-                '\\' => {
-                    let text = builder.get_mut_string_without_current_ascii_char(self);
-                    self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence);
-                }
-                _ => builder.push_matching(c),
-            }
+            },
+            handle_match: |_next_byte, _start| {
+                // TODO: This should be `unreachable!()`
+                Kind::Undetermined
+            },
+            handle_eof: |_start| {
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                Kind::Undetermined
+            },
+        };
+    }
+
+    /// Consume rest of template literal after a `\r` is found.
+    ///
+    /// # SAFETY
+    /// * Byte at `pos` must be `\r`.
+    /// * `pos` must not be before `self.source.position()`.
+    unsafe fn template_literal_carriage_return(
+        &mut self,
+        mut pos: SourcePosition<'a>,
+        substitute: Kind,
+        tail: Kind,
+    ) -> Kind {
+        // Create arena string to hold modified template literal, containing up to before `\r`.
+        // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
+        let str = self.template_literal_create_string(pos);
+
+        // Skip `\r`.
+        // SAFETY: Caller guarantees byte at `pos` is `\r`, so `pos + 1` is a UTF-8 char boundary.
+        pos = pos.add(1);
+
+        // If at EOF, exit. This is illegal in valid JS, so cold branch.
+        if pos.addr() == self.source.end_addr() {
+            return cold_branch(|| {
+                self.source.advance_to_end();
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                Kind::Undetermined
+            });
+        }
+
+        // Start next chunk after `\r`
+        let chunk_start = pos;
+
+        // If next char is `\n`, start next search after it.
+        // `\n` is first char of next chunk, so it'll get added to `str` when chunk is pushed.
+        // SAFETY: Have checked not at EOF.
+        if pos.read() == b'\n' {
+            // SAFETY: `\n` is ASCII, so advancing past it leaves `pos` on a UTF-8 char boundary
+            pos = pos.add(1);
         }
-        self.error(diagnostics::UnterminatedString(self.unterminated_range()));
-        Kind::Undetermined
+
+        self.template_literal_different(str, pos, chunk_start, true, substitute, tail)
+    }
+
+    /// Consume rest of template literal after a `\` escape is found.
+    ///
+    /// # SAFETY
+    /// * Byte at `pos` must be `\`.
+    /// * `pos` must not be before `self.source.position()`.
+    unsafe fn template_literal_backslash(
+        &mut self,
+        pos: SourcePosition<'a>,
+        substitute: Kind,
+        tail: Kind,
+    ) -> Kind {
+        // Create arena string to hold modified template literal, containing up to before `\`.
+        // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
+        let mut str = self.template_literal_create_string(pos);
+
+        // Decode escape sequence into `str`.
+        // `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
+        // SAFETY: Caller guarantees next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
+        let after_backslash = pos.add(1);
+        self.source.set_position(after_backslash);
+
+        let mut is_valid_escape_sequence = true;
+        self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence);
+
+        // Continue search after escape
+        let after_escape = self.source.position();
+        // SAFETY: `pos` and `chunk_start` are the same, so `chunk_start` is not after `pos`.
+        self.template_literal_different(
+            str,
+            after_escape,
+            after_escape,
+            is_valid_escape_sequence,
+            substitute,
+            tail,
+        )
+    }
+
+    /// Create arena string for modified template literal, containing the template literal up to `pos`.
+    /// # SAFETY
+    /// `pos` must not be before `self.source.position()`
+    unsafe fn template_literal_create_string(&self, pos: SourcePosition) -> String<'a> {
+        // Create arena string to hold modified template literal.
+        // We don't know how long template literal will end up being. Take a guess that total length
+        // will be double what we've seen so far, or `MIN_ESCAPED_TEMPLATE_LIT_LEN` minimum.
+        // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
+        let so_far = self.source.str_from_current_to_pos_unchecked(pos);
+        let capacity = max(so_far.len() * 2, MIN_ESCAPED_TEMPLATE_LIT_LEN);
+        let mut str = String::with_capacity_in(capacity, self.allocator);
+        str.push_str(so_far);
+        str
+    }
+
+    /// Process template literal after `\r` or `\` found.
+    /// # SAFETY
+    /// `chunk_start` must not be after `pos`.
+    unsafe fn template_literal_different(
+        &mut self,
+        mut str: String<'a>,
+        pos: SourcePosition<'a>,
+        mut chunk_start: SourcePosition<'a>,
+        mut is_valid_escape_sequence: bool,
+        substitute: Kind,
+        tail: Kind,
+    ) -> Kind {
+        byte_search! {
+            lexer: self,
+            table: TEMPLATE_LITERAL_TABLE,
+            start: pos,
+            continue_if: |next_byte, pos| {
+                match next_byte {
+                    b'$' => {
+                        // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary
+                        let after_dollar = pos.add(1);
+                        if after_dollar.addr() < self.source.end_addr() {
+                            // If `${`, exit.
+                            // SAFETY: Have checked there's at least 1 further byte to read.
+                            if unsafe { after_dollar.read() } == b'{' {
+                                // Add last chunk to `str` and record string.
+                                // SAFETY: TODO
+                                let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
+                                str.push_str(chunk);
+                                self.save_template_string(
+                                    is_valid_escape_sequence,
+                                    str.into_bump_str(),
+                                );
+
+                                // Consume `${` and exit.
+                                // SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary.
+                                self.source.set_position(unsafe { after_dollar.add(1) });
+                                return substitute;
+                            }
+                            // Not `${`. Continue searching.
+                            true
+                        } else {
+                            // This is last byte in file. Continue to `handle_eof`.
+                            // This is illegal in valid JS, so mark this branch cold.
+                            cold_branch(|| true)
+                        }
+                    },
+                    b'`' => {
+                        // Add last chunk to `str` and record string.
+                        // SAFETY: TODO
+                        let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
+                        str.push_str(chunk);
+                        self.save_template_string(
+                            is_valid_escape_sequence,
+                            str.into_bump_str(),
+                        );
+
+                        // Consume b'`' and exit.
+                        // SAFETY: Byte at `pos` is '`', so `pos + 1` is a UTF-8 char boundary.
+                        let after_backtick = pos.add(1);
+                        self.source.set_position(after_backtick);
+                        return tail;
+                    },
+                    b'\r' => {
+                        // Add chunk before `\r` to `str`.
+                        // SAFETY: TODO
+                        let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
+                        str.push_str(chunk);
+
+                        // Set next chunk to start after `\r`.
+                        // SAFETY: TODO
+                        chunk_start = pos.add(1);
+
+                        if chunk_start.addr() < self.source.end_addr() {
+                            // If next char is `\n`, start next search after it.
+                            // NB: `byte_search!` macro already advances `pos` by 1, so only advance
+                            // by 1 here, so that in total we skip 2 bytes for `\r\n`.
+                            // No need to push `\n` to `str`, as it's 1st char of next chunk,
+                            // and will be added to `str` when next chunk is pushed.
+                            if chunk_start.read() == b'\n' {
+                                // SAFETY: TODO
+                                pos = chunk_start;
+                            }
+                            true
+                        } else {
+                            // This is last byte in file. Continue to `handle_eof`.
+                            // This is illegal in valid JS, so mark this branch cold.
+                            cold_branch(|| true)
+                        }
+                    }
+                    _ => {
+                        // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
+                        debug_assert!(next_byte == b'\\');
+
+                        // Add chunk before escape to `str`.
+                        // SAFETY: TODO
+                        let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
+                        str.push_str(chunk);
+
+                        // Decode escape sequence into `str`.
+                        // `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
+                        // SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
+                        let after_backslash = pos.add(1);
+                        self.source.set_position(after_backslash);
+                        self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence);
+
+                        // Start next chunk after escape sequence
+                        chunk_start = self.source.position();
+
+                        // Continue search after escape sequence.
+                        // NB: `byte_search!` macro increments `pos`, so need to subtract 1 here
+                        // to counteract that.
+                        // SAFETY: Added 1 to `pos` above, and `read_string_escape_sequence` only
+                        // advances `self.source`, so subtracting 1 again is within bounds.
+                        // TODO: This isn't good. It relies on behavior of `read_string_escape_sequence`,
+                        // which makes no promise not to rewind `Source`.
+                        pos = chunk_start.sub(1);
+
+                        true
+                    }
+                }
+            },
+            handle_match: |_next_byte, _start| {
+                // TODO: This should be `unreachable!()`
+                Kind::Undetermined
+            },
+            handle_eof: |_start| {
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                Kind::Undetermined
+            },
+        };
     }
 
     /// Re-tokenize the current `}` token for `TemplateSubstitutionTail`
@@ -53,16 +312,8 @@ impl<'a> Lexer<'a> {
         self.finish_next(kind)
     }
 
-    /// Save the template if it is escaped
-    fn save_template_string(
-        &mut self,
-        is_valid_escape_sequence: bool,
-        has_escape: bool,
-        s: &'a str,
-    ) {
-        if !has_escape {
-            return;
-        }
+    /// Save template string
+    fn save_template_string(&mut self, is_valid_escape_sequence: bool, s: &'a str) {
         self.escaped_templates.insert(self.token.start, is_valid_escape_sequence.then_some(s));
         self.token.escaped = true;
     }