Skip to content

Commit

Permalink
perf(parser): faster lexing regexes
Browse files Browse the repository at this point in the history
Consume line breaks
  • Loading branch information
overlookmotel committed Feb 29, 2024
1 parent 51e0e5c commit 379ff0d
Showing 1 changed file with 130 additions and 32 deletions.
162 changes: 130 additions & 32 deletions crates/oxc_parser/src/lexer/regex.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,25 @@
use super::{Kind, Lexer, RegExpFlags, Token};
use super::{
cold_branch,
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
Kind, Lexer, RegExpFlags, SourcePosition, Token,
};
use crate::diagnostics;

use oxc_syntax::identifier::is_line_terminator;
// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS)
// Both are 3-byte UTF-8 sequences that share the same first two bytes (`0xE2 0x80`);
// only the third byte differs (`0xA8` for LS, `0xA9` for PS).

/// First UTF-8 byte shared by LS and PS. Other 3-byte characters also start with this byte,
/// so seeing it alone does not prove a line break.
const LS_OR_PS_FIRST: u8 = 0xE2;
/// Second and third UTF-8 bytes of LS (`\u{2028}`).
const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA8];
/// Second and third UTF-8 bytes of PS (`\u{2029}`).
const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];

/// Bytes the regex body search in `read_regex` stops on:
/// `/`, `[`, `]`, `\`, `\r`, `\n`, and the first byte of LS/PS.
static REGEX_END_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!(
b,
b'/' | b'[' | b']' | b'\\' | b'\r' | b'\n' | LS_OR_PS_FIRST
));

/// Bytes that may start a line break: `\r`, `\n`, and the first byte of LS/PS.
/// `read_regex` uses this to test the byte that follows a backslash.
static LINE_BREAK_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'\r' | b'\n' | LS_OR_PS_FIRST));

/// Bytes that `read_regex_flags` consumes as candidate regex flags:
/// ASCII alphanumerics, `$` and `_`. All matching bytes are ASCII.
static MAYBE_REGEX_FLAG_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| b.is_ascii_alphanumeric() || matches!(b, b'$' | b'_'));

impl<'a> Lexer<'a> {
/// Re-tokenize the current `/` or `/=` and return `RegExp`
Expand All @@ -25,47 +43,127 @@ impl<'a> Lexer<'a> {

/// 12.9.5 Regular Expression Literals
///
/// Scans the body of a regex literal and returns a `(u32, RegExpFlags)` pair.
/// On the success path this is whatever `read_regex_flags` returns for the position of the
/// closing `/`. On the error paths (line break or end of file before the closing `/`) an
/// `UnterminatedRegExp` diagnostic is recorded and an offset is returned with empty flags.
fn read_regex(&mut self) -> (u32, RegExpFlags) {
// NOTE(review): this text comes from a rendered diff whose +/- markers were lost, so
// removed and added lines are interleaved inside this function. The notes below mark
// the spans that look like the removed implementation - confirm against the repository
// before relying on them.
//
// NOTE(review): `in_escape` is only used by the char-by-char loop below; presumably removed.
let mut in_escape = false;
// Set between a `[` and the next `]` found by the search; a `/` seen while this is set
// does not end the regex.
let mut in_character_class = false;
// NOTE(review): the `loop { match self.next_char() ... }` scan that follows (up to the
// `byte_search!` invocation) appears to be the removed char-by-char implementation.
loop {
match self.next_char() {
None => {
self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
return (self.offset(), RegExpFlags::empty());
}
Some(c) if is_line_terminator(c) => {
self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
#[allow(clippy::cast_possible_truncation)]
let pattern_end = self.offset() - c.len_utf8() as u32;
return (pattern_end, RegExpFlags::empty());
}
Some(c) => {
if in_escape {
in_escape = false;
} else if c == '/' && !in_character_class {
break;
} else if c == '[' {

// Byte-wise search for the next byte matched by `REGEX_END_TABLE`.
// The `continue_if` body evaluates to `true` to keep searching and `false` to stop.
byte_search! {
lexer: self,
table: REGEX_END_TABLE,
continue_if: (next_byte, pos) {
// Match found. Decide whether to continue searching.
match next_byte {
b'/' => {
// A `/` inside a character class is part of the pattern, not the terminator.
if in_character_class {
true
} else {
// SAFETY: Next byte is `/` which is ASCII
return unsafe { self.read_regex_flags(pos) };
}
},
b'[' => {
in_character_class = true;
// NOTE(review): the next three lines look like leftovers of the removed
// `if`/`else if` chain.
} else if c == '\\' {
in_escape = true;
} else if c == ']' {
true
}
b']' => {
in_character_class = false;
true
}
b'\\' => {
// SAFETY: Next byte is `\` which is ASCII, so +1 byte is a UTF-8 char boundary
let after_backslash = unsafe { pos.add(1) };
if after_backslash.addr() < self.source.end_addr() {
// SAFETY: Have checked not at EOF, so safe to read a byte
if LINE_BREAK_TABLE.matches(unsafe { after_backslash.read() }) {
// `\r`, `\n`, or first byte of PS/LS after backslash.
// Continue search, so that if it is a line break (at present could be
// some other Unicode char starting with same byte as PS/LS),
// then next turn of search will raise an error.
// If it's not a line break, search will continue.
// Line breaks are illegal in valid JS, and Unicode chars are rare,
// so cold branch.
cold_branch(|| true)
} else {
// Skip next byte.
// Macro will already advance 1 byte, so this advances 2 bytes total,
// past the `\` and the next byte. This may place `pos` in middle of
// a multi-byte Unicode character, but `REGEX_END_TABLE` doesn't match
// any UTF-8 continuation characters, so in that case `pos` will end up
// on a UTF-8 char boundary again after next turn of the search.
pos = after_backslash;
true
}
} else {
// This is last byte in file. Continue to `handle_eof`.
// This is illegal in valid JS, so mark this branch cold.
cold_branch(|| true)
}
},
// Remaining bytes matched by `REGEX_END_TABLE`: `\r`, `\n`, or `LS_OR_PS_FIRST`.
_ => cold_branch(|| {
// Likely line break.
// Line breaks are illegal in valid JS, and Unicode is also rare, so cold branch.
// SAFETY: This may place `pos` in middle of a UTF-8 char, but if so that's
// fixed below.
pos = unsafe { pos.add(1) };
if next_byte == LS_OR_PS_FIRST {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to read 2 bytes (we already skipped the `0xE2` byte).
let next2 = unsafe { pos.read2() };
if matches!(next2, LS_BYTES_2_AND_3 | PS_BYTES_2_AND_3) {
// Irregular line break. Consume it and stop searching.
// SAFETY: Irregular line breaks are 3-byte chars. We consumed 1 byte already.
pos = unsafe { pos.add(2) };
false
} else {
// Some other Unicode char beginning with `0xE2`, not a line break.
// Skip 3 bytes (already skipped 1, and macro skips 1 more, so skip 1 more
// here to make 3), and continue searching.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
pos = unsafe { pos.add(1) };
true
}
} else {
// Regular line break. Stop searching, so fall through to `handle_match`
// which raises an error. Already consumed the line break.
debug_assert!(matches!(next_byte, b'\r' | b'\n'));
false
}
})
// NOTE(review): some of the closing braces below presumably belong to the removed loop.
}
}
}
},
handle_eof: 0, // Fall through to below
};

// Line break found, or end of file reached (legal end is handled above, where the
// closing `/` returns via `read_regex_flags`).
self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
(self.offset(), RegExpFlags::empty())
}

/// Read regex flags.
/// # SAFETY
/// Byte at `pos` must be ASCII.
unsafe fn read_regex_flags(&mut self, pos: SourcePosition) -> (u32, RegExpFlags) {
let pattern_end = self.source.offset_of(pos);

// SAFETY: Caller guarantees byte at `pos` is ASCII, so advancing 1 byte is on UTF-8 char boundary.
let after_slash = pos.add(1);
self.source.set_position(after_slash);

let pattern_end = self.offset() - 1; // -1 to exclude `/`
let mut flags = RegExpFlags::empty();
while let Some(b) = self.source.peek_byte() {
if !MAYBE_REGEX_FLAG_TABLE.matches(b) {
break;
}

// SAFETY: `MAYBE_REGEX_FLAG_TABLE` only matches ASCII bytes, so consuming 1 byte
// will leave `source` on a UTF-8 char boundary
self.source.next_byte_unchecked();

while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
self.consume_char();
let Ok(flag) = RegExpFlags::try_from(ch) else {
self.error(diagnostics::RegExpFlag(ch, self.current_offset()));
let Ok(flag) = RegExpFlags::try_from(b as char) else {
self.error(diagnostics::RegExpFlag(b as char, self.current_offset()));
continue;
};
if flags.contains(flag) {
self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset()));
self.error(diagnostics::RegExpFlagTwice(b as char, self.current_offset()));
continue;
}
flags |= flag;
Expand Down

0 comments on commit 379ff0d

Please sign in to comment.