diff --git a/Cargo.lock b/Cargo.lock index 8dec80ce9c7c8..18af93174a7e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -53,6 +53,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" +[[package]] +name = "assert-unchecked" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7330592adf847ee2e3513587b4db2db410a0d751378654e7e993d9adcbe5c795" + [[package]] name = "async-trait" version = "0.1.77" @@ -1735,6 +1741,7 @@ dependencies = [ name = "oxc_parser" version = "0.5.0" dependencies = [ + "assert-unchecked", "bitflags 2.4.1", "miette", "num-bigint", diff --git a/Cargo.toml b/Cargo.toml index 201e6fd85ca84..621dbacfd8afb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,6 +83,7 @@ oxc_prettier = { path = "crates/oxc_prettier" } oxc_tasks_common = { path = "tasks/common" } oxc_language_server = { path = "crates/oxc_language_server" } +assert-unchecked = { version = "0.1.2" } bpaf = { version = "0.9.8" } bitflags = { version = "2.4.1" } bumpalo = { version = "3.14.0" } diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml index 16635051d228e..36289093450ad 100644 --- a/crates/oxc_parser/Cargo.toml +++ b/crates/oxc_parser/Cargo.toml @@ -25,9 +25,10 @@ oxc_syntax = { workspace = true } oxc_diagnostics = { workspace = true } oxc_index = { workspace = true } -bitflags = { workspace = true } -rustc-hash = { workspace = true } -num-bigint = { workspace = true } +assert-unchecked = { workspace = true } +bitflags = { workspace = true } +rustc-hash = { workspace = true } +num-bigint = { workspace = true } [dev-dependencies] oxc_ast = { workspace = true, features = ["serde"] } diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index fe56d53be7887..9532f5077ae79 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -11,6 +11,7 @@ mod 
string_builder; mod token; mod trivia_builder; +use assert_unchecked::assert_unchecked; use rustc_hash::FxHashMap; use std::{collections::VecDeque, str::Chars}; @@ -270,6 +271,20 @@ impl<'a> Lexer<'a> { self.current.chars.next().unwrap() } + /// Consume the current char when it's known to be ASCII. + /// This compiles down to a single instruction, just incrementing `chars` iterator's pointer. + /// NOTE: Caller must ensure not at EOF and current char is ASCII. + #[inline] + fn consume_ascii_char(&mut self) -> char { + let s = self.current.chars.as_str(); + // SAFETY: Caller must ensure not at EOF and current char is ASCII. + unsafe { + assert_unchecked!(!s.is_empty()); + assert_unchecked!(s.as_bytes()[0] < 128); + } + self.current.chars.next().unwrap() + } + /// Peek the next char without advancing the position #[inline] fn peek(&self) -> Option<char> { @@ -1315,28 +1330,33 @@ static BYTE_HANDLERS: [ByteHandler; 128] = [ L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 ]; +// `\0` `\1` etc const ERR: ByteHandler = |lexer| { - let c = lexer.consume_char(); + // Next char is an ASCII char e.g. `\0` + let c = lexer.consume_ascii_char(); lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range())); Kind::Undetermined }; -// +// const SPS: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is an ASCII space character + lexer.consume_ascii_char(); Kind::WhiteSpace }; // '\r' '\n' const LIN: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `\r` or `\n`, which are both ASCII + lexer.consume_ascii_char(); lexer.current.token.is_on_new_line = true; Kind::NewLine }; // ! 
const EXL: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `!`, which is ASCII + lexer.consume_ascii_char(); if lexer.next_eq('=') { if lexer.next_eq('=') { Kind::Neq2 @@ -1350,7 +1370,8 @@ const EXL: ByteHandler = |lexer| { // ' " const QOT: ByteHandler = |lexer| { - let c = lexer.consume_char(); + // Next char is `'` or `"`, which are both ASCII + let c = lexer.consume_ascii_char(); if lexer.context == LexerContext::JsxAttributeValue { lexer.read_jsx_string_literal(c) } else { @@ -1360,7 +1381,8 @@ const QOT: ByteHandler = |lexer| { // # const HAS: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `#`, which is ASCII + lexer.consume_ascii_char(); // HashbangComment :: // `#!` SingleLineCommentChars? if lexer.current.token.start == 0 && lexer.next_eq('!') { @@ -1377,7 +1399,8 @@ const IDT: ByteHandler = |lexer| { // % const PRC: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `%`, which is ASCII + lexer.consume_ascii_char(); if lexer.next_eq('=') { Kind::PercentEq } else { @@ -1387,7 +1410,8 @@ const PRC: ByteHandler = |lexer| { // & const AMP: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `&`, which is ASCII + lexer.consume_ascii_char(); if lexer.next_eq('&') { if lexer.next_eq('=') { Kind::Amp2Eq @@ -1403,19 +1427,22 @@ const AMP: ByteHandler = |lexer| { // ( const PNO: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `(`, which is ASCII + lexer.consume_ascii_char(); Kind::LParen }; // ) const PNC: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `)`, which is ASCII + lexer.consume_ascii_char(); Kind::RParen }; // * const ATR: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `*`, which is ASCII + lexer.consume_ascii_char(); if lexer.next_eq('*') { if lexer.next_eq('=') { Kind::Star2Eq @@ -1431,7 +1458,8 @@ const ATR: ByteHandler = |lexer| { // + const PLS: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `+`, which is ASCII 
+ lexer.consume_ascii_char(); if lexer.next_eq('+') { Kind::Plus2 } else if lexer.next_eq('=') { @@ -1443,25 +1471,29 @@ const PLS: ByteHandler = |lexer| { // , const COM: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `,`, which is ASCII + lexer.consume_ascii_char(); Kind::Comma }; // - const MIN: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `-`, which is ASCII + lexer.consume_ascii_char(); lexer.read_minus().unwrap_or_else(|| lexer.skip_single_line_comment()) }; // . const PRD: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `.`, which is ASCII + lexer.consume_ascii_char(); lexer.read_dot() }; // / const SLH: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `/`, which is ASCII + lexer.consume_ascii_char(); match lexer.peek() { Some('/') => { lexer.current.chars.next(); @@ -1484,37 +1516,43 @@ const SLH: ByteHandler = |lexer| { // 0 const ZER: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `0`, which is ASCII + lexer.consume_ascii_char(); lexer.read_zero() }; // 1 to 9 const DIG: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is an ASCII digit + lexer.consume_ascii_char(); lexer.decimal_literal_after_first_digit() }; // : const COL: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `:`, which is ASCII + lexer.consume_ascii_char(); Kind::Colon }; // ; const SEM: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `;`, which is ASCII + lexer.consume_ascii_char(); Kind::Semicolon }; // < const LSS: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `<`, which is ASCII + lexer.consume_ascii_char(); lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment()) }; // = const EQL: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `=`, which is ASCII + lexer.consume_ascii_char(); if lexer.next_eq('=') { if lexer.next_eq('=') { Kind::Eq3 @@ -1530,14 +1568,16 @@ const EQL: ByteHandler = 
|lexer| { // > const GTR: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `>`, which is ASCII + lexer.consume_ascii_char(); // `>=` is re-lexed with [Lexer::next_jsx_child] Kind::RAngle }; // ? const QST: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `?`, which is ASCII + lexer.consume_ascii_char(); if lexer.next_eq('?') { if lexer.next_eq('=') { Kind::Question2Eq @@ -1559,20 +1599,26 @@ const QST: ByteHandler = |lexer| { // @ const AT_: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `@`, which is ASCII + lexer.consume_ascii_char(); Kind::At }; // [ const BTO: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `[`, which is ASCII + lexer.consume_ascii_char(); Kind::LBrack }; // \ const ESC: ByteHandler = |lexer| { - let mut builder = AutoCow::new(lexer); - lexer.consume_char(); + let lexer_ref = lexer as &Lexer<'_>; + let mut builder = AutoCow::new(lexer_ref); + // Next char at start of this function was `\`, which is ASCII. + // `AutoCow::new` cannot have changed the state of `lexer.current.chars` iterator, + // as we explicitly passed it only an immutable reference. 
+ lexer.consume_ascii_char(); builder.force_allocation_without_current_ascii_char(lexer); lexer.identifier_unicode_escape_sequence(&mut builder, true); let text = lexer.identifier_name(builder); @@ -1581,13 +1627,15 @@ const ESC: ByteHandler = |lexer| { // ] const BTC: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `]`, which is ASCII + lexer.consume_ascii_char(); Kind::RBrack }; // ^ const CRT: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `^`, which is ASCII + lexer.consume_ascii_char(); if lexer.next_eq('=') { Kind::CaretEq } else { @@ -1597,19 +1645,22 @@ const CRT: ByteHandler = |lexer| { // ` const TPL: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is '`', which is ASCII + lexer.consume_ascii_char(); lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate) }; // { const BEO: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `{`, which is ASCII + lexer.consume_ascii_char(); Kind::LCurly }; // | const PIP: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `|`, which is ASCII + lexer.consume_ascii_char(); if lexer.next_eq('|') { if lexer.next_eq('=') { Kind::Pipe2Eq @@ -1625,13 +1676,15 @@ const PIP: ByteHandler = |lexer| { // } const BEC: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `}`, which is ASCII + lexer.consume_ascii_char(); Kind::RCurly }; // ~ const TLD: ByteHandler = |lexer| { - lexer.consume_char(); + // Next char is `~`, which is ASCII + lexer.consume_ascii_char(); Kind::Tilde };