Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(parser): faster lexing template strings #2539

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions crates/oxc_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ mod regex;
mod search;
mod source;
mod string;
mod string_builder;
mod template;
mod token;
mod trivia_builder;
Expand All @@ -38,7 +37,6 @@ use oxc_span::{SourceType, Span};
use self::{
byte_handlers::handle_byte,
source::{Source, SourcePosition},
string_builder::AutoCow,
trivia_builder::TriviaBuilder,
};
pub use self::{
Expand Down
92 changes: 64 additions & 28 deletions crates/oxc_parser/src/lexer/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,10 @@ impl<'a> Source<'a> {
// SAFETY: `start` and `end` are created from a `&str` in `Source::new`,
// so guaranteed to be start and end of a valid UTF-8 string
unsafe {
let len = self.end as usize - self.start as usize;
let slice = slice::from_raw_parts(self.start, len);
str::from_utf8_unchecked(slice)
self.str_between_positions_unchecked(
SourcePosition::new(self.start),
SourcePosition::new(self.end),
)
}
}

Expand All @@ -125,10 +126,10 @@ impl<'a> Source<'a> {
// Invariant of `Source` is that `ptr` is always on a UTF-8 character boundary,
// so slice from `ptr` to `end` will always be a valid UTF-8 string.
unsafe {
let len = self.end as usize - self.ptr as usize;
let slice = slice::from_raw_parts(self.ptr, len);
debug_assert!(slice.is_empty() || !is_utf8_cont_byte(slice[0]));
str::from_utf8_unchecked(slice)
self.str_between_positions_unchecked(
SourcePosition::new(self.ptr),
SourcePosition::new(self.end),
)
}
}

Expand Down Expand Up @@ -192,6 +193,7 @@ impl<'a> Source<'a> {
self.ptr = pos.ptr;
}

/// Advance `Source`'s cursor to end.
#[inline]
pub(super) fn advance_to_end(&mut self) {
self.ptr = self.end;
Expand All @@ -204,45 +206,79 @@ impl<'a> Source<'a> {
unsafe { self.str_from_pos_to_current_unchecked(pos) }
}

/// Get string slice from a `SourcePosition` up to the current position of `Source`,
/// without checks.
/// Get string slice from a `SourcePosition` up to current position of `Source`, without checks.
///
/// SAFETY:
/// # SAFETY
/// `pos` must not be after current position of `Source`.
/// This is always the case if both:
/// 1. `Source::set_position` has not been called since `pos` was created.
/// 2. `pos` has not been advanced with `SourcePosition::add`.
#[inline]
pub(super) unsafe fn str_from_pos_to_current_unchecked(&self, pos: SourcePosition) -> &'a str {
// SAFETY: Caller guarantees `pos` is not after current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
}

/// Slice the source text starting at the current position of `Source` and ending at `pos`,
/// without performing any checks.
///
/// # SAFETY
/// `pos` must be at or after the current position of `Source`.
/// That holds whenever both of the following are true:
/// 1. `Source::set_position` has not been called since `pos` was created.
/// 2. `pos` has not been moved backwards with `SourcePosition::sub`.
#[inline]
pub(super) unsafe fn str_from_current_to_pos_unchecked(&self, pos: SourcePosition) -> &'a str {
    // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
    let current = SourcePosition::new(self.ptr);
    // SAFETY: Caller guarantees `pos` is not before current position of `Source`,
    // so `current` is never after `pos`.
    self.str_between_positions_unchecked(current, pos)
}

/// Slice the source text from `pos` through to the end of `Source`.
#[inline]
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
    // SAFETY: Invariants of `SourcePosition` are that it cannot be after end of `Source`,
    // and that it is always on a UTF-8 character boundary.
    // `self.end` is always a valid `SourcePosition` due to invariants of `Source`.
    unsafe {
        let end = SourcePosition::new(self.end);
        self.str_between_positions_unchecked(pos, end)
    }
}

/// Get string slice of source between 2 `SourcePosition`s, without checks.
///
/// # SAFETY
/// `start` must not be after `end`.
#[inline]
pub(super) unsafe fn str_between_positions_unchecked(
&self,
start: SourcePosition,
end: SourcePosition,
) -> &'a str {
// Check `start` is not after `end`
debug_assert!(start.ptr <= end.ptr);
// Check `start` and `end` are within bounds of `Source`
debug_assert!(start.ptr >= self.start);
debug_assert!(end.ptr <= self.end);
// Check `start` and `end` are on UTF-8 character boundaries.
// SAFETY: Above assertions ensure `start` and `end` are valid to read from if not at EOF.
debug_assert!(start.ptr == self.end || !is_utf8_cont_byte(start.read()));
debug_assert!(end.ptr == self.end || !is_utf8_cont_byte(end.read()));

// SAFETY: Caller guarantees `start` is not after `end`.
// `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
// from this `Source`, therefore `pos.ptr` and `self.ptr` must both be within the same allocation
// and derived from the same original pointer.
// from this `Source`, therefore `start.ptr` and `end.ptr` must both be within the same
// allocation, and derived from the same original pointer.
// Invariants of `Source` and `SourcePosition` types guarantee that both are positioned
// on UTF-8 character boundaries. So slicing source text between these 2 points will always
// yield a valid UTF-8 string.
debug_assert!(pos.ptr <= self.ptr);
let len = self.ptr as usize - pos.addr();
let slice = slice::from_raw_parts(pos.ptr, len);
let len = end.addr() - start.addr();
let slice = slice::from_raw_parts(start.ptr, len);
std::str::from_utf8_unchecked(slice)
}

/// Get string slice from a `SourcePosition` up to the end of `Source`.
#[inline]
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
// SAFETY: Invariants of `SourcePosition` are that it cannot be after end of `Source`,
// and that it is always on a UTF-8 character boundary
unsafe {
// Length in bytes of the slice: `self.end` (as an address) minus `pos.addr()`.
let len = self.end as usize - pos.addr();
// View the `len` bytes starting at `pos.ptr` as a byte slice, then as a `&str`
// without re-validating UTF-8.
let slice = slice::from_raw_parts(pos.ptr, len);
std::str::from_utf8_unchecked(slice)
}
}

/// Get current position in source, relative to start of source.
#[allow(clippy::cast_possible_truncation)]
#[inline]
Expand All @@ -267,7 +303,7 @@ impl<'a> Source<'a> {
/// * Moving back `n` bytes would not place current position on a UTF-8 character boundary.
#[inline]
pub(super) fn back(&mut self, n: usize) {
// This assertion is essential to ensure safety of `pos.read()` call below.
// This assertion is essential to ensure safety of `new_pos.read()` call below.
// Without this check, calling `back(0)` on an empty `Source` would cause reading
// out of bounds.
// Compiler should remove this assertion when inlining this function,
Expand Down
74 changes: 0 additions & 74 deletions crates/oxc_parser/src/lexer/string_builder.rs

This file was deleted.

Loading
Loading