From 30d85dfab3ef6a6da51485450cd5122efa5c482c Mon Sep 17 00:00:00 2001
From: Yuval Kohavi <yuval.kohavi@gmail.com>
Date: Fri, 3 May 2024 16:23:32 -0400
Subject: [PATCH] fix edge case with upper case

---
 .gitignore                     |   3 +-
 changelog/v0.7.1/sanitize.yaml |   3 +
 kubeutils/strings.go           | 105 +++++++++++++++++++++++++
 kubeutils/strings_test.go      | 137 +++++++++++++++++++++++++++++++++
 kubeutils/util.go              |   9 +--
 kubeutils/util_test.go         |  70 +++++++++++++++++
 6 files changed, 320 insertions(+), 7 deletions(-)
 create mode 100644 changelog/v0.7.1/sanitize.yaml
 create mode 100644 kubeutils/strings.go
 create mode 100644 kubeutils/strings_test.go

diff --git a/.gitignore b/.gitignore
index 7234a56..9485ca3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,4 +23,5 @@ test-report.json
 junit*.xml
 *cov
 
-*.test
\ No newline at end of file
+*.test
+kubeutils/testdata/fuzz/
diff --git a/changelog/v0.7.1/sanitize.yaml b/changelog/v0.7.1/sanitize.yaml
new file mode 100644
index 0000000..1ee24b5
--- /dev/null
+++ b/changelog/v0.7.1/sanitize.yaml
@@ -0,0 +1,3 @@
+changelog:
+  - type: NON_USER_FACING
+    description: Fix and make santize faster
\ No newline at end of file
diff --git a/kubeutils/strings.go b/kubeutils/strings.go
new file mode 100644
index 0000000..7645105
--- /dev/null
+++ b/kubeutils/strings.go
@@ -0,0 +1,105 @@
+package kubeutils
+
+import (
+	"bytes"
+	"crypto/md5"
+	"encoding/hex"
+	"hash"
+	"sync"
+)
+
+// This is an extrememly important number for these shortened names.
+// It signifies the spot of separation from the original name and the hash.
+// It is used for many string building and parsing operations below.
+const magicNumber = 31
+const totalSize = magicNumber*2 + 1
+const encodedMd5 = 2 * md5.Size
+
+const separator = '-'
+
+// We can short-circuit the comparison if the first 31 characters are not equal.
+// Otherweise we need to compare the shortened version of the strings.
+func ShortenedEquals(shortened, standard string) bool {
+
+	// If the standard string is less than 63 characters, we can just compare the strings.
+	if len(standard) <= totalSize {
+		return shortened == standard
+	}
+
+	// If the shortened string is less than or equal to 32 characters, we can just compare the strings.
+	// Also if it's less than 32 the below checks may crash.
+	if len(shortened) <= magicNumber+1 {
+		return shortened == standard
+	}
+
+	// Check the first 31 characters, if they're not equal we can exit early.
+	if shortened[:magicNumber] != standard[:magicNumber] {
+		return false
+	}
+
+	// If 32nd character of the shortened string is not a '-' or the 32nd character of the standard string is not a '-'
+	// we can exit early.
+	// In theory this shouldn't be necessary, but this label can technically be modified by the user,
+	// so it's safer to double check.
+	if shortened[magicNumber] != separator {
+		return false
+	}
+
+	// Check the last 32 characters of the shortened string against the hash of the standard string.
+	hashed := hashName(standard)
+	return shortened[magicNumber+1:] == string(hashed[:magicNumber])
+}
+
+// shortenName is extrememly inefficient with it's allocation of slices for hashing.
+// We can re-use the arrays to avoid this allocation. However, this code may be called
+// from multiple go-routines simultaneously so we must house these objects in sync.Pools
+
+// Pool of MD5 hashers to avoid allocation.
+var md5HasherPool = sync.Pool{
+	New: func() interface{} {
+		return md5.New()
+	},
+}
+
+// Pool of string builders to avoid allocation.
+var byteBufferPool = sync.Pool{
+	New: func() interface{} {
+		b := &bytes.Buffer{}
+		b.Grow(totalSize)
+		return b
+	},
+}
+
+// hashName returns a hash of the input string in base 16 format
+// This function is optimized for speed and memory usage.
+// It should aboid nearly all allocations by re-using the same buffers whenever possible.
+func hashName(name string) [encodedMd5]byte {
+	hasher := md5HasherPool.Get().(hash.Hash)
+	hasher.Reset()
+	hasher.Write([]byte(name))
+	hashArray := [md5.Size]byte{}
+	hash := hasher.Sum(hashArray[:0])
+	// Cannot use hex.EncodedLen() here because it's a func, but it just returns 2 * len(src)
+	hashBufferArray := [encodedMd5]byte{}
+	hex.Encode(hashBufferArray[:], hash)
+	md5HasherPool.Put(hasher)
+	return hashBufferArray
+}
+
+// shortenName returns a shortened version of the input string.
+// It is based on the `kubeutils.SanitizeNameV2` function, but it
+// just does the shortening part.
+func ShortenName(name string) string {
+	if len(name) > totalSize {
+		hash := hashName(name)
+		builder := byteBufferPool.Get().(*bytes.Buffer)
+		builder.Reset()
+		builder.Grow(totalSize)
+		builder.WriteString(name[:magicNumber])
+		builder.WriteRune(separator)
+		builder.Write(hash[:magicNumber])
+		name = builder.String()
+		byteBufferPool.Put(builder)
+	}
+	return name
+}
diff --git a/kubeutils/strings_test.go b/kubeutils/strings_test.go
new file mode 100644
index 0000000..2aa2d49
--- /dev/null
+++ b/kubeutils/strings_test.go
@@ -0,0 +1,137 @@
+package kubeutils
+
+import (
+	"crypto/md5"
+	"fmt"
+	"testing"
+)
+
+// This function is a copy of the old version of this function.
+// It is used to ensure parity with the old implementation.
+func shortenName(name string) string {
+	if len(name) > 63 {
+		hash := md5.Sum([]byte(name))
+		name = fmt.Sprintf("%s-%x", name[:31], hash)
+		name = name[:63]
+	}
+	return name
+}
+
+func BenchmarkShortenEqual(b *testing.B) {
+	b.Run("shorten name old", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			shortenName("jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf")
+		}
+	})
+
+	b.Run("shortened equals--worst case", func(b *testing.B) {
+		shortened := "jfdklanfkljasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848d"
+		standard := "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf"
+		for i := 0; i < b.N; i++ {
+			ShortenedEquals(shortened, standard)
+		}
+	})
+
+	b.Run("shortened equals--different prefix", func(b *testing.B) {
+		shortened := "jfdklanfkljasfhjhlxacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848d"
+		standard := "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf"
+		for i := 0; i < b.N; i++ {
+			ShortenedEquals(shortened, standard)
+		}
+	})
+
+	b.Run("shortened equals--less than 63 characters", func(b *testing.B) {
+		shortened := "hello"
+		standard := "hello"
+		for i := 0; i < b.N; i++ {
+			ShortenedEquals(shortened, standard)
+		}
+	})
+}
+
+func FuzzShortNameParity(f *testing.F) {
+	// Random string < 63
+	f.Add("hello")
+	// Random string > 63
+	f.Add("jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf")
+	f.Fuzz(func(t *testing.T, a string) {
+		oldName := shortenName(a)
+		newName := ShortenName(a)
+		if oldName != newName {
+			t.Fatalf("shortenName(%s) = %s, ShortenName(%s) = %s", a, oldName, a, newName)
+		}
+
+		equal := ShortenedEquals(newName, a)
+		if !equal {
+			t.Fatalf("ShortenedEquals(%s, %s) = %t", newName, a, equal)
+		}
+	})
+}
+
+func TestShortenName(t *testing.T) {
+	t.Run("shorten name < 63", func(t *testing.T) {
+		name := "hello"
+		shortened := ShortenName(name)
+		if shortened != name {
+			t.Fatalf("ShortenName(%s) = %s", name, shortened)
+		}
+	})
+
+	t.Run("shorten name > 63", func(t *testing.T) {
+		name := "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf"
+		shortened := ShortenName(name)
+		if len(shortened) != 63 {
+			t.Fatalf("ShortenName(%s) = %s", name, shortened)
+		}
+
+		if shortened != "jfdklanfkljasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848d" {
+			t.Fatalf("ShortenName(%s) = %s", name, shortened)
+		}
+	})
+}
+
+func TestShortenedEquals(t *testing.T) {
+
+	testCases := []struct {
+		name      string
+		shortened string
+		equal     bool
+	}{
+		{
+			name:      "hello",
+			shortened: "hello",
+			equal:     true,
+		},
+		{
+			name:      "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf",
+			shortened: "jfdklanfkljasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848d",
+			equal:     true,
+		},
+		{
+			name:      "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf",
+			shortened: "jfdklanfkljasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848",
+			equal:     false,
+		},
+		{
+			name:      "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf",
+			shortened: "jfdklanfkljasfhjhldacaslkhdfkjsf1e0028d0fbfe9afbd1a8bb9b53848ds",
+			equal:     false,
+		},
+		{
+			name:      "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf",
+			shortened: "jfdklanfkjasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848ds",
+			equal:     false,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			equal := ShortenedEquals(tc.shortened, tc.name)
+			if equal != tc.equal {
+				t.Fatalf("ShortenedEquals(%s, %s) = %t", tc.shortened, tc.name, equal)
+			}
+		})
+
+	}
+
+}
diff --git a/kubeutils/util.go b/kubeutils/util.go
index 78637aa..b4448f2 100644
--- a/kubeutils/util.go
+++ b/kubeutils/util.go
@@ -4,7 +4,6 @@ import (
 	"crypto/md5"
 	"fmt"
 	"strings"
-	"unicode"
 )
 
 // use SanitizeNameV2
@@ -36,12 +35,10 @@ func SanitizeNameV2(name string) string {
 		case '[', ']', '\n', '"', '\'':
 			return -1
 		}
-		return unicode.ToLower(r)
+		return r
 	}, name)
 	if len(name) > 63 {
-		hash := md5.Sum([]byte(name))
-		name = fmt.Sprintf("%s-%x", name[:31], hash)
-		name = name[:63]
+		name = ShortenName(name)
 	}
-	return name
+	return strings.ToLower(name)
 }
diff --git a/kubeutils/util_test.go b/kubeutils/util_test.go
index ca2891f..081f8dc 100644
--- a/kubeutils/util_test.go
+++ b/kubeutils/util_test.go
@@ -1,19 +1,48 @@
 package kubeutils
 
 import (
+	"crypto/md5"
+	"fmt"
+	"strings"
+	"testing"
 	"time"
+	"unicode/utf8"
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/onsi/gomega/gmeasure"
 )
 
+// Here for fuzz tests..
+func sanitizeNameV2Old(name string) string {
+	name = strings.Replace(name, "*", "-", -1)
+	name = strings.Replace(name, "/", "-", -1)
+	name = strings.Replace(name, ".", "-", -1)
+	name = strings.Replace(name, "[", "", -1)
+	name = strings.Replace(name, "]", "", -1)
+	name = strings.Replace(name, ":", "-", -1)
+	name = strings.Replace(name, "_", "-", -1)
+	name = strings.Replace(name, " ", "-", -1)
+	name = strings.Replace(name, "\n", "", -1)
+	name = strings.Replace(name, "\"", "", -1)
+	name = strings.Replace(name, "'", "", -1)
+	if len(name) > 63 {
+		hash := md5.Sum([]byte(name))
+		name = fmt.Sprintf("%s-%x", name[:31], hash)
+		name = name[:63]
+	}
+	name = strings.Replace(name, ".", "-", -1)
+	name = strings.ToLower(name)
+	return name
+}
+
 var _ = Describe("sanitize name", func() {
 
 	DescribeTable("sanitize short names", func(in, out string) {
 		Expect(SanitizeNameV2(in)).To(Equal(out))
 	},
 		Entry("basic a", "abc", "abc"),
+		Entry("basic A", "Abc", "abc"),
 		Entry("basic b", "abc123", "abc123"),
 		Entry("subX *", "bb*", "bb-"),
 		Entry("sub *", "bb*b", "bb-b"),
@@ -63,3 +92,44 @@ var _ = Describe("sanitize name", func() {
 		}, gmeasure.SamplingConfig{N: 200, Duration: time.Minute})
 	})
 })
+
+func FuzzSanitizeNameParity(f *testing.F) {
+	// Random string < 63
+	f.Add("VirtualGateway-istio-ingressgateway-bookinfo-cluster-1-istio-ingressgateway-istio-gateway-ns-cluster-1-gloo-mesh-cluster-1-HTTPS.443-anything")
+	f.Add("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
+	f.Add("abc")
+	f.Add("abc123")
+	f.Add("bb*")
+	f.Add("bb*b")
+	f.Add("bb/")
+	f.Add("bb/b")
+	f.Add("bb.")
+	f.Add("bb.b")
+	f.Add("bb[")
+	f.Add("bb[b")
+	f.Add("bb]")
+	f.Add("bb]b")
+	f.Add("bb:")
+	f.Add("bb:b")
+	f.Add("bb ")
+	f.Add("bb b")
+	f.Add("bb\n")
+	f.Add("bb\nb")
+	f.Add("aa\"")
+	f.Add("bb\"b")
+	f.Add("aa'")
+	f.Add("bb'b")
+	f.Add("jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf")
+
+	f.Fuzz(func(t *testing.T, a string) {
+		// we can only  get a valid kube name that's alphanumeric
+		if !utf8.Valid([]byte(a)) {
+			t.Skip("Skipping non-valid utf8 input")
+		}
+		oldName := SanitizeNameV2(a)
+		newName := sanitizeNameV2Old(a)
+		if oldName != newName {
+			t.Fatalf("SanitizeNameV2(%s) = %s, SanitizeNameV2Old(%s) = %s", a, oldName, a, newName)
+		}
+	})
+}