Skip to content

Commit

Permalink
refactor(parser): small efficiencies in byte_search macro usage (#2554)
Browse files (browse the repository at this point in the history)

A few small efficiencies in usage of `byte_search` macro for lexing comments.
  • Loading branch information
overlookmotel authored Mar 1, 2024
1 parent 3d354d4 commit c579620
Showing 1 changed file with 17 additions and 11 deletions.
28 changes: 17 additions & 11 deletions crates/oxc_parser/src/lexer/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ impl<'a> Lexer<'a> {
self.trivia_builder
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
unsafe { pos = pos.add(1) };
pos = unsafe { pos.add(1) };
// We've found the end. Do not continue searching.
false
} else {
Expand All @@ -48,22 +48,22 @@ impl<'a> Lexer<'a> {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = unsafe { pos.add(1).read2() };
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
if matches!(next2, LS_BYTES_2_AND_3 | PS_BYTES_2_AND_3) {
// Irregular line break
self.trivia_builder
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
// Advance `pos` to after this char.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
unsafe { pos = pos.add(3) };
pos = unsafe { pos.add(3) };
// We've found the end. Do not continue searching.
false
} else {
// Some other Unicode char beginning with `0xE2`.
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
unsafe { pos = pos.add(2) };
pos = unsafe { pos.add(2) };
true
}
})
Expand Down Expand Up @@ -93,13 +93,15 @@ impl<'a> Lexer<'a> {
continue_if: |next_byte, pos| {
// Match found. Decide whether to continue searching.
if next_byte == b'*' {
if pos.addr() < self.source.end_addr() - 1 {
// SAFETY: Next byte is `*` (ASCII) so after it is UTF-8 char boundary
let after_star = unsafe { pos.add(1) };
if after_star.addr() < self.source.end_addr() {
// If next byte isn't `/`, continue
// SAFETY: Have checked there's at least 1 further byte to read
if unsafe { pos.add(1).read() } == b'/' {
if unsafe { after_star.read() } == b'/' {
// Consume `*/`
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
unsafe { pos = pos.add(2) };
pos = unsafe { pos.add(2) };
false
} else {
true
Expand All @@ -115,19 +117,23 @@ impl<'a> Lexer<'a> {
cold_branch(|| {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = unsafe { pos.add(1).read2() };
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
let next2 = unsafe {
pos = pos.add(1);
pos.read2()
};
if matches!(next2, LS_BYTES_2_AND_3 | PS_BYTES_2_AND_3) {
// Irregular line break
self.token.is_on_new_line = true;
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
// but can't do that easily because can't use `return` in a closure.
// But irregular line breaks are rare anyway.
}
// Either way, continue searching.
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
// Skip 3 bytes (skipped 1 byte above, macro skips 1 more, so skip 1 more here
// to make 3), and continue searching.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
unsafe { pos = pos.add(2) };
pos = unsafe { pos.add(1) };
true
})
} else {
Expand Down

0 comments on commit c579620

Please sign in to comment.