From 7aaf22339c9418d9c021f5409b74c4d2e9ed8b48 Mon Sep 17 00:00:00 2001 From: kamphaus Date: Mon, 4 Jun 2018 02:06:14 +0200 Subject: [PATCH] CTable improvements. Fixes #738 (#739) --- darwin/ctype.go | 24 ++++++++++++++---------- linux/ctype.go | 11 +++++++++-- tests/ctype.c | 19 +++++++++++++++++++ 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/darwin/ctype.go b/darwin/ctype.go index 1c8836340..54216cbb3 100644 --- a/darwin/ctype.go +++ b/darwin/ctype.go @@ -42,48 +42,52 @@ const ( // to handle this, so if you know one please consider putting in a PR :) func IsType(_c CtRuneT, _f uint32) uint32 { // These are the easy ones. - if _f&CtypeA != 0 && unicode.IsLetter(rune(_c)) { + if _f&CtypeA != 0 && unicode.IsLetter(rune(_c)) && rune(_c) < 0x80 { return 1 } - if _f&CtypeC != 0 && unicode.IsControl(rune(_c)) { + if _f&CtypeC != 0 && unicode.IsControl(rune(_c)) && rune(_c) < 0x80 { return 1 } - if _f&CtypeD != 0 && unicode.IsDigit(rune(_c)) { + if _f&CtypeD != 0 && unicode.IsDigit(rune(_c)) && rune(_c) < 0x80 { return 1 } // The IsSpace check is required because Go treats spaces as graphic // characters, which C does not. - if _f&CtypeG != 0 && unicode.IsGraphic(rune(_c)) && !unicode.IsSpace(rune(_c)) { + if _f&CtypeG != 0 && unicode.IsGraphic(rune(_c)) && !unicode.IsSpace(rune(_c)) && rune(_c) < 0x80 { return 1 } - if _f&CtypeL != 0 && unicode.IsLower(rune(_c)) { + if _f&CtypeL != 0 && unicode.IsLower(rune(_c)) && rune(_c) < 0x80 { return 1 } - if _f&CtypeP != 0 && unicode.IsPunct(rune(_c)) { + // Need to check for 0x24, 0x2b, 0x3c-0x3e, 0x5e, 0x60, 0x7c, 0x7e + // because Go doesn't treat $+<=>^`|~ as punctuation. + if _f&CtypeP != 0 && rune(_c) < 0x80 && (unicode.IsPunct(rune(_c)) || rune(_c) == 0x24 || rune(_c) == 0x2b || + (rune(_c) >= 0x3c && rune(_c) <= 0x3e) || rune(_c) == 0x5e || rune(_c) == 0x60 || + rune(_c) == 0x7c || rune(_c) == 0x7e) { return 1 } - if _f&CtypeS != 0 && unicode.IsSpace(rune(_c)) { + if _f&CtypeS != 0 && unicode.IsSpace(rune(_c)) && rune(_c) < 0x80 { return 1 } - if _f&CtypeU != 0 && unicode.IsUpper(rune(_c)) { + if _f&CtypeU != 0 && unicode.IsUpper(rune(_c)) && rune(_c) < 0x80 { return 1 } - if _f&CtypeR != 0 && unicode.IsPrint(rune(_c)) { + if _f&CtypeR != 0 && unicode.IsPrint(rune(_c)) && rune(_c) < 0x80 { return 1 } // TODO: Is this really the right way to do this? if _f&CtypeX != 0 && (unicode.IsDigit(rune(_c)) || (_c >= 'a' && _c <= 'f') || - (_c >= 'A' && _c <= 'F')) { + (_c >= 'A' && _c <= 'F')) && rune(_c) < 0x80 { return 1 } diff --git a/linux/ctype.go b/linux/ctype.go index f9ad7f51f..fbdc33ad8 100644 --- a/linux/ctype.go +++ b/linux/ctype.go @@ -7,7 +7,7 @@ import ( var characterTable []uint16 func generateCharacterTable() { - for i := 0; i < 255; i++ { + for i := 0; i < 0x80; i++ { var c uint16 // Each of the bitwise expressions below were copied from the enum @@ -60,7 +60,10 @@ func generateCharacterTable() { c |= ((1 << (9)) >> 8) } - if unicode.IsPunct(rune(i)) { + // Need to check for 0x24, 0x2b, 0x3c-0x3e, 0x5e, 0x60, 0x7c, 0x7e + // because Go doesn't treat $+<=>^`|~ as punctuation. + if unicode.IsPunct(rune(i)) || i == 0x24 || i == 0x2b || (i >= 0x3c && i <= 0x3e) || i == 0x5e || i == 0x60 || + i == 0x7c || i == 0x7e { c |= ((1 << (10)) >> 8) } @@ -72,6 +75,10 @@ func generateCharacterTable() { // test if this works right now. characterTable = append(characterTable, c) } + for i := 0x80; i < 256; i++ { + // false for all characters > 0x7f + characterTable = append(characterTable, 0) + } } // CtypeLoc handles __ctype_b_loc(). It returns a character table. diff --git a/tests/ctype.c b/tests/ctype.c index 8d54825b4..d5fe4104f 100644 --- a/tests/ctype.c +++ b/tests/ctype.c @@ -24,6 +24,8 @@ char *strnul = "this string has a \0 NUL"; char arrnul[] = "this string has a \0 NUL"; +#define PRINTF_BOOL(v) { if(v) printf("T"); else printf("F"); } + int main() { plan(104); @@ -49,6 +51,23 @@ int main() _CTYPE(isupper, F, T, F, F, F, F, F, F); CTYPE(isxdigit, T, T, T, F, F, F, F, F); + diag("char properties for characters 0-255:"); + for(int i=0; i<256; i++) { + printf("%x: ", i); + PRINTF_BOOL(isalnum(i)); + PRINTF_BOOL(isalpha(i)); + PRINTF_BOOL(iscntrl(i)); + PRINTF_BOOL(isdigit(i)); + PRINTF_BOOL(isgraph(i)); + PRINTF_BOOL(islower(i)); + PRINTF_BOOL(isprint(i)); + PRINTF_BOOL(ispunct(i)); + PRINTF_BOOL(isspace(i)); + PRINTF_BOOL(isupper(i)); + PRINTF_BOOL(isxdigit(i)); + printf("\n"); + } + diag("tolower"); is_eq(tolower('a'), 'a'); is_eq(tolower('B'), 'b');