Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

isutf8: implement RFC 3629 #1045

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions libc/str/isutf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ static const char kUtf8Dispatch[] = {
1, 1, 1, 1, 1, 1, 1, 1, // 0320
1, 1, 1, 1, 1, 1, 1, 1, // 0330
2, 3, 3, 3, 3, 3, 3, 3, // 0340 utf8-3
3, 3, 3, 3, 3, 3, 3, 3, // 0350
4, 5, 5, 5, 5, 0, 0, 0, // 0360 utf8-4
3, 3, 3, 3, 3, 4, 3, 3, // 0350
5, 6, 6, 6, 7, 0, 0, 0, // 0360 utf8-4
0, 0, 0, 0, 0, 0, 0, 0, // 0370
};

Expand Down Expand Up @@ -94,6 +94,7 @@ bool32 isutf8(const void *data, size_t size) {
}
// fallthrough
case 3:
case_utf8_3:
if (p + 2 <= e && //
(p[0] & 0300) == 0200 && //
(p[1] & 0300) == 0200) { //
Expand All @@ -103,11 +104,17 @@ bool32 isutf8(const void *data, size_t size) {
return false; // missing cont
}
case 4:
if (p < e && (*p & 040)) {
return false; // utf-16 surrogate
}
goto case_utf8_3;
case 5:
if (p < e && (*p & 0377) < 0220) {
return false; // overlong
}
// fallthrough
case 5:
case 6:
case_utf8_4:
if (p + 3 <= e && //
(((uint32_t)(p[+2] & 0377) << 030 | //
(uint32_t)(p[+1] & 0377) << 020 | //
Expand All @@ -119,6 +126,11 @@ bool32 isutf8(const void *data, size_t size) {
} else {
return false; // missing cont
}
case 7:
if (p < e && (*p & 0x3F) > 0xF) {
return false; // over limit
}
goto case_utf8_4;
default:
__builtin_unreachable();
}
Expand Down
6 changes: 6 additions & 0 deletions test/libc/str/isutf8_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,19 @@ TEST(isutf8, good) {
"剑号巨阙 珠称夜光 果珍李柰 菜重芥姜 海咸河淡 鳞潜羽翔"
"龙师火帝 鸟官人皇 始制文字 乃服衣裳 推位让国 有虞陶唐",
-1));
EXPECT_TRUE(isutf8("\xf4\x8f\xbf\xbf", -1));
EXPECT_TRUE(isutf8("\xed\x9f\xbf", -1));
EXPECT_TRUE(isutf8("\xee\x80\x80", -1));
}

// Checks that isutf8() returns false for byte sequences that are not
// well-formed UTF-8: overlong forms, stray or missing continuation
// bytes, lead bytes outside the UTF-8 range, code points past U+10FFFF,
// and encoded UTF-16 surrogates.
// NOTE(review): every call passes -1 as the size; presumably that asks
// isutf8() to treat the data as NUL-terminated -- confirm against its docs.
TEST(isutf8, bad) {
ASSERT_FALSE(isutf8("\300\200", -1)); // overlong nul
ASSERT_FALSE(isutf8("\200\300", -1)); // latin1 c1 control code
ASSERT_FALSE(isutf8("\300\300", -1)); // missing continuation
ASSERT_FALSE(isutf8("\377\200\200\200\200", -1)); // thompson-pike varint
ASSERT_FALSE(isutf8("\xf4\x90\x80\x80", -1)); // over limit: decodes to U+110000
ASSERT_FALSE(isutf8("\xed\xa0\x80", -1)); // utf-16 surrogate: decodes to U+D800
ASSERT_FALSE(isutf8("\xed\xbf\xbf", -1)); // utf-16 surrogate: decodes to U+DFFF
}

TEST(isutf8, oob) {
Expand Down