-
Notifications
You must be signed in to change notification settings - Fork 526
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Revert "Drop intern package and lift method to parquetquery package"
This reverts commit 76876fe.
- Loading branch information
Showing
3 changed files
with
105 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
// Package intern is a utility for interning the byte slices held by pq.Values. | ||
// It is not safe for concurrent use. | ||
// | ||
// The Interner is used to intern the byte slices held by pq.Values. This is | ||
// useful for reducing memory usage and improving performance when working | ||
// with large datasets that contain many repeated strings. | ||
package intern | ||
|
||
import ( | ||
"unique" | ||
|
||
pq "github.com/parquet-go/parquet-go" | ||
) | ||
|
||
// Interner exposes UnsafeClone for copying pq.Value instances.
// It has no fields, so every Interner behaves the same way.
type Interner struct{}

// New returns a pointer to an empty Interner.
func New() *Interner {
	return new(Interner)
}
|
||
func (i *Interner) UnsafeClone(v *pq.Value) pq.Value { | ||
switch v.Kind() { | ||
case pq.ByteArray, pq.FixedLenByteArray: | ||
return *unique.Make(v).Value() | ||
default: | ||
return *v | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
package intern | ||
|
||
import ( | ||
"fmt" | ||
"testing" | ||
|
||
pq "github.com/parquet-go/parquet-go" | ||
) | ||
|
||
func TestInterner_UnsafeClone(t *testing.T) { | ||
i := New() | ||
|
||
value1 := pq.ByteArrayValue([]byte("foo")) | ||
value2 := pq.ByteArrayValue([]byte("foo")) | ||
|
||
clone1 := i.UnsafeClone(&value1) | ||
clone2 := i.UnsafeClone(&value2) | ||
|
||
if clone1.ByteArray()[0] != clone2.ByteArray()[0] { | ||
// Values are interned, so the memory address should be the same | ||
t.Error("expected same memory address") | ||
} | ||
|
||
if value1.ByteArray()[0] != value2.ByteArray()[0] { | ||
// Mutates the original value, so the memory address should be different as well | ||
t.Error("expected same memory address") | ||
} | ||
} | ||
|
||
func BenchmarkIntern(b *testing.B) { | ||
words := []string{"foo", "bar", "baz", "qux", "quux", "corge", "grault", "garply", "waldo", "fred", "plugh", "xyzzy", "thud"} | ||
testCases := []struct { | ||
name string | ||
valueFn func(i int) pq.Value | ||
}{ | ||
{ | ||
name: "byte_array", | ||
valueFn: func(i int) pq.Value { return pq.ByteArrayValue([]byte(words[i%len(words)])) }, | ||
}, | ||
{ | ||
name: "fixed_len_byte_array", | ||
valueFn: func(i int) pq.Value { return pq.FixedLenByteArrayValue([]byte(words[i%len(words)])) }, | ||
}, | ||
{ | ||
name: "bool", | ||
valueFn: func(i int) pq.Value { return pq.BooleanValue(i%2 == 0) }, | ||
}, | ||
{ | ||
name: "int32", | ||
valueFn: func(i int) pq.Value { return pq.Int32Value(int32(i)) }, | ||
}, | ||
} | ||
|
||
for _, tc := range testCases { | ||
b.Run(fmt.Sprintf("no_interning: %s", tc.name), func(b *testing.B) { | ||
for i := 0; i < b.N; i++ { | ||
value := tc.valueFn(i) | ||
_ = value.Clone() | ||
} | ||
}) | ||
|
||
b.Run(fmt.Sprintf("interning: %s", tc.name), func(b *testing.B) { | ||
interner := New() | ||
|
||
b.ResetTimer() | ||
for i := 0; i < b.N; i++ { | ||
value := tc.valueFn(i) | ||
_ = interner.UnsafeClone(&value) | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters