From 7aaf22339c9418d9c021f5409b74c4d2e9ed8b48 Mon Sep 17 00:00:00 2001
From: kamphaus <christophe.kamphaus@gmail.com>
Date: Mon, 4 Jun 2018 02:06:14 +0200
Subject: [PATCH] CTable improvements. Fixes #738 (#739)

---
 darwin/ctype.go | 24 ++++++++++++++----------
 linux/ctype.go  | 11 +++++++++--
 tests/ctype.c   | 19 +++++++++++++++++++
 3 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/darwin/ctype.go b/darwin/ctype.go
index 1c8836340..54216cbb3 100644
--- a/darwin/ctype.go
+++ b/darwin/ctype.go
@@ -42,48 +42,52 @@ const (
 // to handle this, so if you know one please consider putting in a PR :)
 func IsType(_c CtRuneT, _f uint32) uint32 {
 	// These are the easy ones.
-	if _f&CtypeA != 0 && unicode.IsLetter(rune(_c)) {
+	if _f&CtypeA != 0 && unicode.IsLetter(rune(_c)) && rune(_c) < 0x80 {
 		return 1
 	}
 
-	if _f&CtypeC != 0 && unicode.IsControl(rune(_c)) {
+	if _f&CtypeC != 0 && unicode.IsControl(rune(_c)) && rune(_c) < 0x80 {
 		return 1
 	}
 
-	if _f&CtypeD != 0 && unicode.IsDigit(rune(_c)) {
+	if _f&CtypeD != 0 && unicode.IsDigit(rune(_c)) && rune(_c) < 0x80 {
 		return 1
 	}
 
 	// The IsSpace check is required because Go treats spaces as graphic
 	// characters, which C does not.
-	if _f&CtypeG != 0 && unicode.IsGraphic(rune(_c)) && !unicode.IsSpace(rune(_c)) {
+	if _f&CtypeG != 0 && unicode.IsGraphic(rune(_c)) && !unicode.IsSpace(rune(_c)) && rune(_c) < 0x80 {
 		return 1
 	}
 
-	if _f&CtypeL != 0 && unicode.IsLower(rune(_c)) {
+	if _f&CtypeL != 0 && unicode.IsLower(rune(_c)) && rune(_c) < 0x80 {
 		return 1
 	}
 
-	if _f&CtypeP != 0 && unicode.IsPunct(rune(_c)) {
+	// Explicitly match 0x24, 0x2b, 0x3c-0x3e, 0x5e, 0x60, 0x7c and 0x7e: unicode.IsPunct
+	// reports false for $+<=>^`|~ (Go classes them as symbols), but C's ispunct accepts them.
+	if _f&CtypeP != 0 && rune(_c) < 0x80 && (unicode.IsPunct(rune(_c)) || rune(_c) == 0x24 || rune(_c) == 0x2b ||
+		(rune(_c) >= 0x3c && rune(_c) <= 0x3e) || rune(_c) == 0x5e || rune(_c) == 0x60 ||
+		rune(_c) == 0x7c || rune(_c) == 0x7e) {
 		return 1
 	}
 
-	if _f&CtypeS != 0 && unicode.IsSpace(rune(_c)) {
+	if _f&CtypeS != 0 && unicode.IsSpace(rune(_c)) && rune(_c) < 0x80 {
 		return 1
 	}
 
-	if _f&CtypeU != 0 && unicode.IsUpper(rune(_c)) {
+	if _f&CtypeU != 0 && unicode.IsUpper(rune(_c)) && rune(_c) < 0x80 {
 		return 1
 	}
 
-	if _f&CtypeR != 0 && unicode.IsPrint(rune(_c)) {
+	if _f&CtypeR != 0 && unicode.IsPrint(rune(_c)) && rune(_c) < 0x80 {
 		return 1
 	}
 
 	// TODO: Is this really the right way to do this?
 	if _f&CtypeX != 0 && (unicode.IsDigit(rune(_c)) ||
 		(_c >= 'a' && _c <= 'f') ||
-		(_c >= 'A' && _c <= 'F')) {
+		(_c >= 'A' && _c <= 'F')) && rune(_c) < 0x80 {
 		return 1
 	}
 
diff --git a/linux/ctype.go b/linux/ctype.go
index f9ad7f51f..fbdc33ad8 100644
--- a/linux/ctype.go
+++ b/linux/ctype.go
@@ -7,7 +7,7 @@ import (
 var characterTable []uint16
 
 func generateCharacterTable() {
-	for i := 0; i < 255; i++ {
+	for i := 0; i < 0x80; i++ {
 		var c uint16
 
 		// Each of the bitwise expressions below were copied from the enum
@@ -60,7 +60,10 @@ func generateCharacterTable() {
 			c |= ((1 << (9)) >> 8)
 		}
 
-		if unicode.IsPunct(rune(i)) {
+		// Explicitly match 0x24, 0x2b, 0x3c-0x3e, 0x5e, 0x60, 0x7c and 0x7e: unicode.IsPunct
+		// reports false for $+<=>^`|~ (Go classes them as symbols), but C's ispunct accepts them.
+		if unicode.IsPunct(rune(i)) || i == 0x24 || i == 0x2b || (i >= 0x3c && i <= 0x3e) || i == 0x5e || i == 0x60 ||
+			i == 0x7c || i == 0x7e {
 			c |= ((1 << (10)) >> 8)
 		}
 
@@ -72,6 +75,10 @@ func generateCharacterTable() {
 		// test if this works right now.
 		characterTable = append(characterTable, c)
 	}
+	for i := 0x80; i < 256; i++ {
+		// false for all characters > 0x7f
+		characterTable = append(characterTable, 0)
+	}
 }
 
 // CtypeLoc handles __ctype_b_loc(). It returns a character table.
diff --git a/tests/ctype.c b/tests/ctype.c
index 8d54825b4..d5fe4104f 100644
--- a/tests/ctype.c
+++ b/tests/ctype.c
@@ -24,6 +24,8 @@
 char *strnul = "this string has a \0 NUL";
 char arrnul[] = "this string has a \0 NUL";
 
+#define PRINTF_BOOL(v) { if(v) printf("T"); else printf("F"); }
+
 int main()
 {
   plan(104);
@@ -49,6 +51,23 @@ int main()
   _CTYPE(isupper, F, T, F, F, F, F, F, F);
   CTYPE(isxdigit, T, T, T, F, F, F, F, F);
 
+  diag("char properties for characters 0-255:");
+  for(int i=0; i<256; i++) {
+    printf("%x: ", i);
+    PRINTF_BOOL(isalnum(i));
+    PRINTF_BOOL(isalpha(i));
+    PRINTF_BOOL(iscntrl(i));
+    PRINTF_BOOL(isdigit(i));
+    PRINTF_BOOL(isgraph(i));
+    PRINTF_BOOL(islower(i));
+    PRINTF_BOOL(isprint(i));
+    PRINTF_BOOL(ispunct(i));
+    PRINTF_BOOL(isspace(i));
+    PRINTF_BOOL(isupper(i));
+    PRINTF_BOOL(isxdigit(i));
+    printf("\n");
+  }
+
   diag("tolower");
   is_eq(tolower('a'), 'a');
   is_eq(tolower('B'), 'b');