From 0852391073be3fa78b555aea27e16855d403e5a8 Mon Sep 17 00:00:00 2001
From: overlookmotel
Date: Fri, 1 Mar 2024 00:36:05 +0000
Subject: [PATCH] perf(parser): faster lexing JSX identifiers

---
 crates/oxc_parser/src/lexer/jsx.rs | 69 ++++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/crates/oxc_parser/src/lexer/jsx.rs b/crates/oxc_parser/src/lexer/jsx.rs
index c1c506fefc9899..4a4f470e9eb27a 100644
--- a/crates/oxc_parser/src/lexer/jsx.rs
+++ b/crates/oxc_parser/src/lexer/jsx.rs
@@ -1,8 +1,18 @@
-use super::{Kind, Lexer, Token};
+use super::{
+    cold_branch,
+    search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
+    Kind, Lexer, Token,
+};
 use crate::diagnostics;
 
 use memchr::memchr;
-use oxc_syntax::identifier::{is_identifier_part, is_identifier_start};
+use oxc_syntax::identifier::{is_identifier_part, is_identifier_start_unicode};
+
+static ASCII_JSX_ID_START_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| b.is_ascii_alphabetic() || matches!(b, b'_' | b'$' | b'-'));
+
+static NOT_ASCII_JSX_ID_CONTINUE_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || matches!(b, b'_' | b'$' | b'-')));
 
 impl<'a> Lexer<'a> {
     /// `JSXDoubleStringCharacters` ::
@@ -94,19 +104,54 @@ impl<'a> Lexer<'a> {
     ///   `JSXIdentifier` `IdentifierPart`
     ///   `JSXIdentifier` [no `WhiteSpace` or Comment here] -
     fn read_jsx_identifier(&mut self, _start_offset: u32) -> Kind {
-        while let Some(c) = self.peek() {
-            if c == '-' || is_identifier_start(c) {
-                self.consume_char();
-                while let Some(c) = self.peek() {
-                    if is_identifier_part(c) {
-                        self.consume_char();
-                    } else {
-                        break;
-                    }
+        // Handle start char
+        let Some(start_byte) = self.source.peek_byte() else {
+            return Kind::Ident;
+        };
+
+        if !start_byte.is_ascii() {
+            // Unicode identifiers are rare, so cold branch
+            return cold_branch(|| {
+                if is_identifier_start_unicode(self.peek().unwrap()) {
+                    self.consume_char();
+                    self.read_jsx_identifier_tail_unicode()
+                } else {
+                    Kind::Ident
                 }
-            } else {
+            });
+        }
+
+        if !ASCII_JSX_ID_START_TABLE.matches(start_byte) {
+            return Kind::Ident;
+        }
+
+        // Consume bytes which are part of identifier tail
+        let next_byte = byte_search! {
+            lexer: self,
+            table: NOT_ASCII_JSX_ID_CONTINUE_TABLE,
+            handle_eof: {
+                return Kind::Ident;
+            },
+        };
+
+        // Found a matching byte.
+        // Either end of identifier found, or a Unicode char.
+        if !next_byte.is_ascii() {
+            return self.read_jsx_identifier_tail_unicode();
+        }
+
+        Kind::Ident
+    }
+
+    /// Consume rest of JSX identifier after Unicode character found.
+    /// `#[cold]` because Unicode chars are rare, and want to keep ASCII fast path.
+    #[cold]
+    fn read_jsx_identifier_tail_unicode(&mut self) -> Kind {
+        while let Some(c) = self.peek() {
+            if !is_identifier_part(c) && c != '-' {
                 break;
             }
+            self.consume_char();
         }
         Kind::Ident
     }