From b0ed72f1792dec8d9538e5ac8ec6c0fb085eebf4 Mon Sep 17 00:00:00 2001
From: Evan Haas
Date: Tue, 7 Nov 2023 19:19:59 -0800
Subject: [PATCH] CharInfo: Remove ascii check from xidStart / xidContinue

Tokenizer takes care of basic char set characters in identifiers
---
 src/CharInfo.zig                  | 20 +++++---------------
 src/unicode/identifier_tables.zig | 17 -----------------
 2 files changed, 5 insertions(+), 32 deletions(-)

diff --git a/src/CharInfo.zig b/src/CharInfo.zig
index 426c3196..73d0d5db 100644
--- a/src/CharInfo.zig
+++ b/src/CharInfo.zig
@@ -489,9 +489,7 @@ pub fn homoglyph(codepoint: u21) ?u21 {
 }
 
 pub fn isXidStart(c: u21) bool {
-    if (c < tables.ascii_start.len) {
-        return tables.ascii_start[c];
-    }
+    assert(c > 0x7F);
     const idx = c / 8 / tables.chunk;
     const chunk: usize = if (idx < tables.trie_start.len) tables.trie_start[idx] else 0;
     const offset = chunk * tables.chunk / 2 + c / 8 % tables.chunk;
@@ -499,9 +497,7 @@ pub fn isXidStart(c: u21) bool {
 }
 
 pub fn isXidContinue(c: u21) bool {
-    if (c < tables.ascii_continue.len) {
-        return tables.ascii_continue[c];
-    }
+    assert(c > 0x7F);
     const idx = c / 8 / tables.chunk;
     const chunk: usize = if (idx < tables.trie_continue.len) tables.trie_continue[idx] else 0;
     const offset = chunk * tables.chunk / 2 + c / 8 % tables.chunk;
@@ -510,7 +506,7 @@ pub fn isXidContinue(c: u21) bool {
 
 test "isXidStart / isXidContinue panic check" {
     const std = @import("std");
-    for (0..std.math.maxInt(u21)) |i| {
+    for (0x80..0x110000) |i| {
         const c: u21 = @intCast(i);
         if (std.unicode.utf8ValidCodepoint(c)) {
             _ = isXidStart(c);
@@ -521,10 +517,7 @@ test "isXidStart / isXidContinue panic check" {
 
 test isXidStart {
     const std = @import("std");
-    try std.testing.expect(isXidStart('a'));
-    try std.testing.expect(isXidStart('Z'));
-    try std.testing.expect(!isXidStart('0'));
-    try std.testing.expect(!isXidStart(' '));
+    try std.testing.expect(!isXidStart('᠑'));
     try std.testing.expect(!isXidStart('™'));
     try std.testing.expect(!isXidStart('£'));
     try std.testing.expect(!isXidStart('\u{1f914}')); // 🤔
@@ -532,10 +525,7 @@ test isXidStart {
 
 test isXidContinue {
     const std = @import("std");
-    try std.testing.expect(isXidContinue('a'));
-    try std.testing.expect(isXidContinue('Z'));
-    try std.testing.expect(isXidContinue('0'));
-    try std.testing.expect(!isXidContinue(' '));
+    try std.testing.expect(isXidContinue('᠑'));
     try std.testing.expect(!isXidContinue('™'));
     try std.testing.expect(!isXidContinue('£'));
     try std.testing.expect(!isXidContinue('\u{1f914}')); // 🤔
diff --git a/src/unicode/identifier_tables.zig b/src/unicode/identifier_tables.zig
index ac5073c9..dae796d8 100644
--- a/src/unicode/identifier_tables.zig
+++ b/src/unicode/identifier_tables.zig
@@ -2,23 +2,6 @@
 //! and Unicode Standard Annex #31 https://www.unicode.org/reports/tr31/
 //! Licensed under the MIT License and the Unicode license
 
-const T: bool = true;
-const F: bool = false;
-
-pub const ascii_start: [128]bool align(64) = .{
-    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
-    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
-    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
-    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
-};
-
-pub const ascii_continue: [128]bool align(64) = .{
-    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
-    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
-    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
-    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
-};
-
 pub const chunk = 64;
 
 pub const trie_start: [402]u8 align(8) = .{