From 408acb90e67fae2ca20c78f4c888399d278594db Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Tue, 16 Jan 2024 05:14:22 +0000 Subject: [PATCH] refactor(parser): lexer handle unicode without branch (#2039) As suggested by @strager in https://github.com/oxc-project/oxc/pull/2025#pullrequestreview-1820273832, this PR extends `BYTE_HANDLERS` from 128 to 256 entries, so that the non-ASCII leading bytes of Unicode characters are dispatched through the table as well. This removes a branch from `read_next_token()` and produces a +1% speed-up on parser benchmarks. --- crates/oxc_parser/src/lexer/mod.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 9532f5077ae79..dd6c6bdb2991a 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -395,11 +395,7 @@ impl<'a> Lexer<'a> { } let byte = remaining.as_bytes()[0]; - let kind = if byte < 128 { - BYTE_HANDLERS[byte as usize](self) - } else { - self.match_unicode_char() - }; + let kind = BYTE_HANDLERS[byte as usize](self); if !matches!( kind, @@ -410,9 +406,7 @@ impl<'a> Lexer<'a> { } } - // `#[cold]` to hint to branch predictor that unicode identifiers and irregular whitespace are rare - #[cold] - fn match_unicode_char(&mut self) -> Kind { + fn unicode_char_handler(&mut self) -> Kind { let c = self.current.chars.clone().next().unwrap(); match c { c if is_id_start_unicode(c) => { @@ -1318,7 +1312,7 @@ type ByteHandler = fn(&mut Lexer<'_>) -> Kind; /// Lookup table mapping any incoming byte to a handler function defined below. 
/// #[rustfmt::skip] -static BYTE_HANDLERS: [ByteHandler; 128] = [ +static BYTE_HANDLERS: [ByteHandler; 256] = [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F // ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, SPS, SPS, LIN, ERR, ERR, // 0 ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 @@ -1328,6 +1322,14 @@ static BYTE_HANDLERS: [ByteHandler; 128] = [ IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6 L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F ]; // `\0` `\1` etc @@ -1856,3 +1858,7 @@ const L_Y: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { "ield" => Kind::Yield, _ => Kind::Ident, }; + +// Non-ASCII characters +#[allow(clippy::redundant_closure_for_method_calls)] +const UNI: ByteHandler = |lexer| lexer.unicode_char_handler();