Skip to content

Commit

Permalink
CharInfo: implement C23 rules for identifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
ehaas committed Nov 8, 2023
1 parent 1cdf4d5 commit 2273bb5
Show file tree
Hide file tree
Showing 5 changed files with 767 additions and 2 deletions.
39 changes: 39 additions & 0 deletions LICENSE-UNICODE
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
UNICODE LICENSE V3

COPYRIGHT AND PERMISSION NOTICE

Copyright © 1991-2023 Unicode, Inc.

NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.

Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
55 changes: 55 additions & 0 deletions src/CharInfo.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
//! various C standards. All classification routines *do not* consider
//! characters from the basic character set; it is assumed those will be
//! checked separately
//! isXidStart and isXidContinue are adapted from https://github.com/dtolnay/unicode-ident
//! and, unlike the other routines, also classify ASCII code points via lookup tables.

const assert = @import("std").debug.assert;
const tables = @import("unicode/identifier_tables.zig");

/// C11 Standard Annex D
pub fn isC11IdChar(codepoint: u21) bool {
Expand Down Expand Up @@ -485,3 +487,56 @@ pub fn homoglyph(codepoint: u21) ?u21 {
else => null,
};
}

/// Reports whether `c` is flagged as an identifier-start code point in the
/// lookup tables (logic adapted from dtolnay/unicode-ident).
///
/// Code points below `tables.ascii_start.len` are answered directly from
/// `tables.ascii_start`. Everything else is resolved through
/// `tables.trie_start`, which selects a region of `tables.leaf`, and the
/// result is a single bit of the selected leaf entry.
pub fn isXidStart(c: u21) bool {
    if (c < tables.ascii_start.len) return tables.ascii_start[c];

    // The low three bits of `c` pick a bit inside a leaf entry; the
    // remaining bits pick the entry itself.
    const entry_index = c / 8;
    const trie_index = entry_index / tables.chunk;

    // Trie indices past the end of the table fall back to chunk 0.
    const leaf_chunk: usize = if (trie_index < tables.trie_start.len)
        tables.trie_start[trie_index]
    else
        0;

    const leaf_index = leaf_chunk * tables.chunk / 2 + entry_index % tables.chunk;
    const bit: u3 = @intCast(c % 8);
    return (tables.leaf[leaf_index] >> bit) & 1 != 0;
}

/// Reports whether `c` is flagged as an identifier-continue code point in the
/// lookup tables (logic adapted from dtolnay/unicode-ident).
///
/// Code points below `tables.ascii_continue.len` are answered directly from
/// `tables.ascii_continue`. Everything else is resolved through
/// `tables.trie_continue`, which selects a region of `tables.leaf`, and the
/// result is a single bit of the selected leaf entry.
pub fn isXidContinue(c: u21) bool {
    if (c < tables.ascii_continue.len) return tables.ascii_continue[c];

    // The low three bits of `c` pick a bit inside a leaf entry; the
    // remaining bits pick the entry itself.
    const entry_index = c / 8;
    const trie_index = entry_index / tables.chunk;

    // Trie indices past the end of the table fall back to chunk 0.
    const leaf_chunk: usize = if (trie_index < tables.trie_continue.len)
        tables.trie_continue[trie_index]
    else
        0;

    const leaf_index = leaf_chunk * tables.chunk / 2 + entry_index % tables.chunk;
    const bit: u3 = @intCast(c % 8);
    return (tables.leaf[leaf_index] >> bit) & 1 != 0;
}

test "isXidStart / isXidContinue panic check" {
    const std = @import("std");

    // Run both lookups on every valid code point below maxInt(u21). The
    // results are discarded: the test only fails if a lookup panics
    // (e.g. an out-of-bounds table index).
    var c: u21 = 0;
    while (c < std.math.maxInt(u21)) : (c += 1) {
        if (!std.unicode.utf8ValidCodepoint(c)) continue;
        _ = isXidStart(c);
        _ = isXidContinue(c);
    }
}

test isXidStart {
    const expect = @import("std").testing.expect;

    // Code points that must be accepted as the first character of an identifier.
    const accepted = [_]u21{ 'a', 'Z' };
    // Code points that must be rejected; the last one is 🤔.
    const rejected = [_]u21{ '0', ' ', '™', '£', '\u{1f914}' };

    for (accepted) |codepoint| try expect(isXidStart(codepoint));
    for (rejected) |codepoint| try expect(!isXidStart(codepoint));
}

test isXidContinue {
    const expect = @import("std").testing.expect;

    // Code points that must be accepted after the first character of an identifier.
    const accepted = [_]u21{ 'a', 'Z', '0' };
    // Code points that must be rejected; the last one is 🤔.
    const rejected = [_]u21{ ' ', '™', '£', '\u{1f914}' };

    for (accepted) |codepoint| try expect(isXidContinue(codepoint));
    for (rejected) |codepoint| try expect(!isXidContinue(codepoint));
}
8 changes: 6 additions & 2 deletions src/LangOpts.zig
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,16 @@ pub const Standard = enum {

pub fn codepointAllowedInIdentifier(standard: Standard, codepoint: u21, is_start: bool) bool {
if (is_start) {
return if (standard.atLeast(.c11))
return if (standard.atLeast(.c2x))
CharInfo.isXidStart(codepoint)
else if (standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint);
} else {
return if (standard.atLeast(.c11))
return if (standard.atLeast(.c2x))
CharInfo.isXidContinue(codepoint)
else if (standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint);
Expand Down
Loading

0 comments on commit 2273bb5

Please sign in to comment.