From 30d85dfab3ef6a6da51485450cd5122efa5c482c Mon Sep 17 00:00:00 2001 From: Yuval Kohavi Date: Fri, 3 May 2024 16:23:32 -0400 Subject: [PATCH] fix edge case with upper case --- .gitignore | 3 +- changelog/v0.7.1/sanitize.yaml | 3 + kubeutils/strings.go | 105 +++++++++++++++++++++++++ kubeutils/strings_test.go | 137 +++++++++++++++++++++++++++++++++ kubeutils/util.go | 9 +-- kubeutils/util_test.go | 70 +++++++++++++++++ 6 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 changelog/v0.7.1/sanitize.yaml create mode 100644 kubeutils/strings.go create mode 100644 kubeutils/strings_test.go diff --git a/.gitignore b/.gitignore index 7234a56..9485ca3 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,5 @@ test-report.json junit*.xml *cov -*.test \ No newline at end of file +*.test +kubeutils/testdata/fuzz/ diff --git a/changelog/v0.7.1/sanitize.yaml b/changelog/v0.7.1/sanitize.yaml new file mode 100644 index 0000000..1ee24b5 --- /dev/null +++ b/changelog/v0.7.1/sanitize.yaml @@ -0,0 +1,3 @@ +changelog: + - type: NON_USER_FACING + description: Fix and make santize faster \ No newline at end of file diff --git a/kubeutils/strings.go b/kubeutils/strings.go new file mode 100644 index 0000000..7645105 --- /dev/null +++ b/kubeutils/strings.go @@ -0,0 +1,105 @@ +package kubeutils + +import ( + "bytes" + "crypto/md5" + "encoding/hex" + "hash" + "sync" +) + +// This is an extrememly important number for these shortened names. +// It signifies the spot of separation from the original name and the hash. +// It is used for many string building and parsing operations below. +const magicNumber = 31 +const totalSize = magicNumber*2 + 1 +const encodedMd5 = 2 * md5.Size + +const separator = '-' + +// We can short-circuit the comparison if the first 31 characters are not equal. +// Otherweise we need to compare the shortened version of the strings. +func ShortenedEquals(shortened, standard string) bool { + + // If the standard string is less than 63 characters, we can just compare the strings. + if len(standard) <= totalSize { + return shortened == standard + } + + // If the shortened string is less than or equal to 32 characters, we can just compare the strings. + // Also if it's less than 32 the below checks may crash. + if len(shortened) <= magicNumber+1 { + return shortened == standard + } + + // Check the first 31 characters, if they're not equal we can exit early. + if shortened[:magicNumber] != standard[:magicNumber] { + return false + } + + // If 32nd character of the shortened string is not a '-' or the 32nd character of the standard string is not a '-' + // we can exit early. + // In theory this shouldn't be necessary, but this label can technically be modified by the user, + // so it's safer to double check. + if shortened[magicNumber] != separator { + return false + } + + // Check the last 32 characters of the shortened string against the hash of the standard string. + hashed := hashName(standard) + return shortened[magicNumber+1:] == string(hashed[:magicNumber]) +} + +// shortenName is extrememly inefficient with it's allocation of slices for hashing. +// We can re-use the arrays to avoid this allocation. However, this code may be called +// from multiple go-routines simultaneously so we must house these objects in sync.Pools + +// Pool of MD5 hashers to avoid allocation. +var md5HasherPool = sync.Pool{ + New: func() interface{} { + return md5.New() + }, +} + +// Pool of string builders to avoid allocation. +var byteBufferPool = sync.Pool{ + New: func() interface{} { + b := &bytes.Buffer{} + b.Grow(totalSize) + return b + }, +} + +// hashName returns a hash of the input string in base 16 format +// This function is optimized for speed and memory usage. +// It should aboid nearly all allocations by re-using the same buffers whenever possible. +func hashName(name string) [encodedMd5]byte { + hasher := md5HasherPool.Get().(hash.Hash) + hasher.Reset() + hasher.Write([]byte(name)) + hashArray := [md5.Size]byte{} + hash := hasher.Sum(hashArray[:0]) + // Cannot use hex.EncodedLen() here because it's a func, but it just returns 2 * len(src) + hashBufferArray := [encodedMd5]byte{} + hex.Encode(hashBufferArray[:], hash) + md5HasherPool.Put(hasher) + return hashBufferArray +} + +// shortenName returns a shortened version of the input string. +// It is based on the `kubeutils.SanitizeNameV2` function, but it +// just does the shortening part. +func ShortenName(name string) string { + if len(name) > totalSize { + hash := hashName(name) + builder := byteBufferPool.Get().(*bytes.Buffer) + builder.Reset() + builder.Grow(totalSize) + builder.WriteString(name[:magicNumber]) + builder.WriteRune(separator) + builder.Write(hash[:magicNumber]) + name = builder.String() + byteBufferPool.Put(builder) + } + return name +} diff --git a/kubeutils/strings_test.go b/kubeutils/strings_test.go new file mode 100644 index 0000000..2aa2d49 --- /dev/null +++ b/kubeutils/strings_test.go @@ -0,0 +1,137 @@ +package kubeutils + +import ( + "crypto/md5" + "fmt" + "testing" +) + +// This function is a copy of the old version of this function. +// It is used to ensure parity with the old implementation. +func shortenName(name string) string { + if len(name) > 63 { + hash := md5.Sum([]byte(name)) + name = fmt.Sprintf("%s-%x", name[:31], hash) + name = name[:63] + } + return name +} + +func BenchmarkShortenEqual(b *testing.B) { + b.Run("shorten name old", func(b *testing.B) { + for i := 0; i < b.N; i++ { + shortenName("jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf") + } + }) + + b.Run("shortened equals--worst case", func(b *testing.B) { + shortened := "jfdklanfkljasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848d" + standard := "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf" + for i := 0; i < b.N; i++ { + ShortenedEquals(shortened, standard) + } + }) + + b.Run("shortened equals--different prefix", func(b *testing.B) { + shortened := "jfdklanfkljasfhjhlxacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848d" + standard := "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf" + for i := 0; i < b.N; i++ { + ShortenedEquals(shortened, standard) + } + }) + + b.Run("shortened equals--less than 63 characters", func(b *testing.B) { + shortened := "hello" + standard := "hello" + for i := 0; i < b.N; i++ { + ShortenedEquals(shortened, standard) + } + }) +} + +func FuzzShortNameParity(f *testing.F) { + // Random string < 63 + f.Add("hello") + // Random string > 63 + f.Add("jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf") + f.Fuzz(func(t *testing.T, a string) { + oldName := shortenName(a) + newName := ShortenName(a) + if oldName != newName { + t.Fatalf("shortenName(%s) = %s, ShortenName(%s) = %s", a, oldName, a, newName) + } + + equal := ShortenedEquals(newName, a) + if !equal { + t.Fatalf("ShortenedEquals(%s, %s) = %t", newName, a, equal) + } + }) +} + +func TestShortenName(t *testing.T) { + t.Run("shorten name < 63", func(t *testing.T) { + name := "hello" + shortened := ShortenName(name) + if shortened != name { + t.Fatalf("ShortenName(%s) = %s", name, shortened) + } + }) + + t.Run("shorten name > 63", func(t *testing.T) { + name := "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf" + shortened := ShortenName(name) + if len(shortened) != 63 { + t.Fatalf("ShortenName(%s) = %s", name, shortened) + } + + if shortened != "jfdklanfkljasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848d" { + t.Fatalf("ShortenName(%s) = %s", name, shortened) + } + }) +} + +func TestShortenedEquals(t *testing.T) { + + testCases := []struct { + name string + shortened string + equal bool + }{ + { + name: "hello", + shortened: "hello", + equal: true, + }, + { + name: "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf", + shortened: "jfdklanfkljasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848d", + equal: true, + }, + { + name: "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf", + shortened: "jfdklanfkljasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848", + equal: false, + }, + { + name: "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf", + shortened: "jfdklanfkljasfhjhldacaslkhdfkjsf1e0028d0fbfe9afbd1a8bb9b53848ds", + equal: false, + }, + { + name: "jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf", + shortened: "jfdklanfkjasfhjhldacaslkhdfkjs-f1e0028d0fbfe9afbd1a8bb9b53848ds", + equal: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + equal := ShortenedEquals(tc.shortened, tc.name) + if equal != tc.equal { + t.Fatalf("ShortenedEquals(%s, %s) = %t", tc.shortened, tc.name, equal) + } + }) + + } + +} diff --git a/kubeutils/util.go b/kubeutils/util.go index 78637aa..b4448f2 100644 --- a/kubeutils/util.go +++ b/kubeutils/util.go @@ -4,7 +4,6 @@ import ( "crypto/md5" "fmt" "strings" - "unicode" ) // use SanitizeNameV2 @@ -36,12 +35,10 @@ func SanitizeNameV2(name string) string { case '[', ']', '\n', '"', '\'': return -1 } - return unicode.ToLower(r) + return r }, name) if len(name) > 63 { - hash := md5.Sum([]byte(name)) - name = fmt.Sprintf("%s-%x", name[:31], hash) - name = name[:63] + name = ShortenName(name) } - return name + return strings.ToLower(name) } diff --git a/kubeutils/util_test.go b/kubeutils/util_test.go index ca2891f..081f8dc 100644 --- a/kubeutils/util_test.go +++ b/kubeutils/util_test.go @@ -1,19 +1,48 @@ package kubeutils import ( + "crypto/md5" + "fmt" + "strings" + "testing" "time" + "unicode/utf8" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/onsi/gomega/gmeasure" ) +// Here for fuzz tests.. +func sanitizeNameV2Old(name string) string { + name = strings.Replace(name, "*", "-", -1) + name = strings.Replace(name, "/", "-", -1) + name = strings.Replace(name, ".", "-", -1) + name = strings.Replace(name, "[", "", -1) + name = strings.Replace(name, "]", "", -1) + name = strings.Replace(name, ":", "-", -1) + name = strings.Replace(name, "_", "-", -1) + name = strings.Replace(name, " ", "-", -1) + name = strings.Replace(name, "\n", "", -1) + name = strings.Replace(name, "\"", "", -1) + name = strings.Replace(name, "'", "", -1) + if len(name) > 63 { + hash := md5.Sum([]byte(name)) + name = fmt.Sprintf("%s-%x", name[:31], hash) + name = name[:63] + } + name = strings.Replace(name, ".", "-", -1) + name = strings.ToLower(name) + return name +} + var _ = Describe("sanitize name", func() { DescribeTable("sanitize short names", func(in, out string) { Expect(SanitizeNameV2(in)).To(Equal(out)) }, Entry("basic a", "abc", "abc"), + Entry("basic A", "Abc", "abc"), Entry("basic b", "abc123", "abc123"), Entry("subX *", "bb*", "bb-"), Entry("sub *", "bb*b", "bb-b"), @@ -63,3 +92,44 @@ var _ = Describe("sanitize name", func() { }, gmeasure.SamplingConfig{N: 200, Duration: time.Minute}) }) }) + +func FuzzSanitizeNameParity(f *testing.F) { + // Random string < 63 + f.Add("VirtualGateway-istio-ingressgateway-bookinfo-cluster-1-istio-ingressgateway-istio-gateway-ns-cluster-1-gloo-mesh-cluster-1-HTTPS.443-anything") + f.Add("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") + f.Add("abc") + f.Add("abc123") + f.Add("bb*") + f.Add("bb*b") + f.Add("bb/") + f.Add("bb/b") + f.Add("bb.") + f.Add("bb.b") + f.Add("bb[") + f.Add("bb[b") + f.Add("bb]") + f.Add("bb]b") + f.Add("bb:") + f.Add("bb:b") + f.Add("bb ") + f.Add("bb b") + f.Add("bb\n") + f.Add("bb\nb") + f.Add("aa\"") + f.Add("bb\"b") + f.Add("aa'") + f.Add("bb'b") + f.Add("jfdklanfkljasfhjhldacaslkhdfkjshfkjsadhfkjasdhgjadhgkdahfjkdahjfdsagdfhjdsagfhasjdfsdfasfsafsdf") + + f.Fuzz(func(t *testing.T, a string) { + // we can only get a valid kube name that's alphanumeric + if !utf8.Valid([]byte(a)) { + t.Skip("Skipping non-valid utf8 input") + } + oldName := SanitizeNameV2(a) + newName := sanitizeNameV2Old(a) + if oldName != newName { + t.Fatalf("SanitizeNameV2(%s) = %s, SanitizeNameV2Old(%s) = %s", a, oldName, a, newName) + } + }) +}