Skip to content

Commit

Permalink
Use Prometheus fast regexp (#4329)
Browse files Browse the repository at this point in the history
* basic integration

Signed-off-by: Joe Elliott <[email protected]>

* patch tests for new meaning

Signed-off-by: Joe Elliott <[email protected]>

* patch up more tests

Signed-off-by: Joe Elliott <[email protected]>

* add basic tests

Signed-off-by: Joe Elliott <[email protected]>

* changelog + docs

Signed-off-by: Joe Elliott <[email protected]>

* remove benches

Signed-off-by: Joe Elliott <[email protected]>

* Cleaned up + tests

Signed-off-by: Joe Elliott <[email protected]>

* comment

Signed-off-by: Joe Elliott <[email protected]>

* lint

Signed-off-by: Joe Elliott <[email protected]>

* Update docs/sources/tempo/traceql/_index.md

Co-authored-by: Kim Nylander <[email protected]>

* comment

Signed-off-by: Joe Elliott <[email protected]>

---------

Signed-off-by: Joe Elliott <[email protected]>
Co-authored-by: Kim Nylander <[email protected]>
  • Loading branch information
joe-elliott and knylander-grafana authored Nov 20, 2024
1 parent 7073d04 commit b61c3ce
Show file tree
Hide file tree
Showing 9 changed files with 214 additions and 60 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
* [ENHANCEMENT] Add `invalid_utf8` to reasons spanmetrics will discard spans. [#4293](https://github.com/grafana/tempo/pull/4293) (@zalegrala)
* [ENHANCEMENT] Reduce frontend and querier allocations by dropping HTTP headers early in the pipeline. [#4298](https://github.com/grafana/tempo/pull/4298) (@joe-elliott)
* [ENHANCEMENT] Reduce ingester working set by improving prealloc behavior. [#4344](https://github.com/grafana/tempo/pull/4344) (@joe-elliott)
* [ENHANCEMENT] Use Prometheus fast regexp for TraceQL regular expression matchers. [#4329](https://github.com/grafana/tempo/pull/4329) (@joe-elliott)
**BREAKING CHANGE** All regular expression matchers will now be fully anchored. `span.foo =~ "bar"` will now be evaluated as `span.foo =~ "^bar$"`
* [BUGFIX] Replace hedged requests roundtrips total with a counter. [#4063](https://github.com/grafana/tempo/pull/4063) [#4078](https://github.com/grafana/tempo/pull/4078) (@galalen)
* [BUGFIX] Metrics generators: Correctly drop from the ring before stopping ingestion to reduce drops during a rollout. [#4101](https://github.com/grafana/tempo/pull/4101) (@joe-elliott)
* [BUGFIX] Correctly handle 400 Bad Request and 404 Not Found in gRPC streaming [#4144](https://github.com/grafana/tempo/pull/4144) (@mapno)
Expand Down
3 changes: 2 additions & 1 deletion docs/sources/tempo/traceql/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,8 @@ The implemented comparison operators are:
- `=~` (regular expression)
- `!~` (negated regular expression)

TraceQL uses Golang regular expressions. Online regular expression testing sites like https://regex101.com/ are convenient to validate regular expressions used in TraceQL queries.
TraceQL uses Golang regular expressions. Online regular expression testing sites like https://regex101.com/ are convenient to validate regular expressions used in TraceQL queries. All regular expressions
are treated as fully anchored. For example, `span.foo =~ "bar"` is evaluated as `span.foo =~ "^bar$"`.

For example, to find all traces where an `http.status_code` attribute in a span are greater than `400` but less than equal to `500`:

Expand Down
62 changes: 14 additions & 48 deletions pkg/parquetquery/predicates.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ package parquetquery
import (
"bytes"
"fmt"
"regexp"
"strings"

"github.com/grafana/tempo/pkg/regexp"

pq "github.com/parquet-go/parquet-go"
)

Expand Down Expand Up @@ -88,9 +89,7 @@ func (p *StringInPredicate) KeepPage(pq.Page) bool {
}

type regexPredicate struct {
regs []*regexp.Regexp
matches map[string]bool
shouldMatch bool
matcher *regexp.Regexp
}

var _ Predicate = (*regexPredicate)(nil)
Expand All @@ -108,66 +107,33 @@ func NewRegexNotInPredicate(regs []string) (Predicate, error) {
}

func newRegexPredicate(regs []string, shouldMatch bool) (Predicate, error) {
p := &regexPredicate{
regs: make([]*regexp.Regexp, 0, len(regs)),
matches: make(map[string]bool),
shouldMatch: shouldMatch,
}
for _, reg := range regs {
r, err := regexp.Compile(reg)
if err != nil {
return nil, err
}
p.regs = append(p.regs, r)
m, err := regexp.NewRegexp(regs, shouldMatch)
if err != nil {
return nil, err
}
return p, nil

return &regexPredicate{
matcher: m,
}, nil
}

func (p *regexPredicate) String() string {
var strings string
for _, s := range p.regs {
strings += fmt.Sprintf("%s, ", s.String())
}
return fmt.Sprintf("RegexInPredicate{%s}", strings)
return fmt.Sprintf("RegexPredicate{%s}", p.matcher.String())
}

func (p *regexPredicate) keep(v *pq.Value) bool {
if v.IsNull() {
return false
}

b := v.ByteArray()

// Check uses zero alloc optimization of map[string([]byte)]
if matched, ok := p.matches[string(b)]; ok {
return matched
}

matched := false
for _, r := range p.regs {
if r.Match(b) == p.shouldMatch {
matched = true
break
}
}

// Only alloc the string when updating the map
p.matches[string(b)] = matched

return matched
return p.matcher.Match(v.ByteArray())
}

func (p *regexPredicate) KeepColumnChunk(cc *ColumnChunkHelper) bool {
d := cc.Dictionary()

// Reset match cache on each row group change
// Use exact size of the incoming dictionary
// if present and larger.
count := len(p.matches)
if d != nil && d.Len() > count {
count = d.Len()
}
p.matches = make(map[string]bool, count)
// should we do this?
p.matcher.Reset()

if d != nil {
return keepDictionary(d, p.KeepValue)
Expand Down
120 changes: 120 additions & 0 deletions pkg/regexp/regexp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package regexp

import (
	"fmt"
	"regexp"
	"strings"
	"unsafe"

	"github.com/prometheus/prometheus/model/labels"
)

// in order to prevent building an enormous map on extremely high cardinality fields we institute a max.
// this number is not tuned. on extremely high cardinality fields memoization is wasteful b/c we rarely
// see the same value 2x. this is all overhead. on lower cardinality fields with an expensive regex memoization
// is very effective at speeding up queries.
const maxMemoize = 1000

// Regexp evaluates one or more regular expressions using Prometheus'
// FastRegexMatcher and optionally memoizes results for patterns the
// matcher could not fully optimize.
type Regexp struct {
	matchers []*labels.FastRegexMatcher // compiled patterns; a value is kept when ANY matcher agrees with shouldMatch
	matches  map[string]bool            // memoized results keyed by input value; nil when memoization is disabled
	// shouldMatch is the polarity: true keeps values that match (=~),
	// false keeps values that do not match (!~).
	shouldMatch bool
}

// NewRegexp compiles the given regular expressions into Prometheus
// FastRegexMatchers. shouldMatch sets the polarity of Match/MatchString:
// true means "keep values matched by any pattern" (=~), false means
// "keep values rejected by any pattern" (!~).
// It returns an error if any expression fails to compile.
func NewRegexp(regexps []string, shouldMatch bool) (*Regexp, error) {
	matchers := make([]*labels.FastRegexMatcher, 0, len(regexps))

	for _, r := range regexps {
		m, err := labels.NewFastRegexMatcher(r)
		if err != nil {
			return nil, err
		}
		matchers = append(matchers, m)
	}

	// only memoize if there's an unoptimized matcher. fully optimized
	// matchers are cheap enough that caching their results is pure overhead.
	// the cache is capped at maxMemoize entries (enforced in MatchString).
	var matches map[string]bool
	for _, m := range matchers {
		if shouldMemoize(m) {
			matches = make(map[string]bool)
			break
		}
	}

	return &Regexp{
		matchers:    matchers,
		matches:     matches,
		shouldMatch: shouldMatch,
	}, nil
}

// Match reports whether b is kept by this Regexp (subject to the shouldMatch
// polarity). The []byte is viewed as a string without copying via unsafe, so
// b must not be mutated for the duration of the call.
func (r *Regexp) Match(b []byte) bool {
	return r.MatchString(unsafe.String(unsafe.SliceData(b), len(b)))
}

// MatchString reports whether s is kept by this Regexp: with shouldMatch ==
// true it returns true when any matcher accepts s; with shouldMatch == false
// it returns true when any matcher rejects s. Results are memoized (up to
// maxMemoize entries) when memoization was enabled at construction time.
func (r *Regexp) MatchString(s string) bool {
	// if we're memoizing check existing matches. the lookup itself is
	// zero-alloc even when s aliases caller-owned bytes (see Match).
	if r.matches != nil {
		if matched, ok := r.matches[s]; ok {
			return matched
		}
	}

	matched := false
	for _, m := range r.matchers {
		if m.MatchString(s) == r.shouldMatch {
			matched = true
			break
		}
	}

	if r.matches != nil && len(r.matches) < maxMemoize {
		// clone the key before storing: s may alias a caller-owned []byte
		// (see Match's unsafe.String conversion) whose backing array can be
		// mutated or reused after we return. storing the aliased string
		// directly would silently corrupt the memoization map.
		r.matches[strings.Clone(s)] = matched
	}

	return matched
}

// Reset drops all memoized results, keeping the map's allocated buckets for
// reuse. It is a no-op when memoization is disabled.
func (r *Regexp) Reset() {
	if r.matches == nil {
		return
	}
	clear(r.matches)
}

// String returns a debug representation of the patterns: each pattern
// followed by ", " (including a trailing separator), matching the previous
// implementation's output exactly.
func (r *Regexp) String() string {
	// previous version accumulated into a local named "strings", shadowing
	// the strings package, via per-iteration fmt.Sprintf. a Builder avoids
	// both the shadowing and the repeated formatting/allocation.
	var sb strings.Builder
	for _, m := range r.matchers {
		sb.WriteString(m.GetRegexString())
		sb.WriteString(", ")
	}
	return sb.String()
}

// shouldMemoize returns true if we believe that memoizing this regex would be faster
// than evaluating it directly. see thoughts below.
func shouldMemoize(m *labels.FastRegexMatcher) bool {
	// matches labels.FastRegexMatcher.
	// NOTE(review): this mirrors the private field layout of the Prometheus
	// type so the unsafe cast below can peek at it. it MUST be kept in sync
	// whenever the prometheus dependency is upgraded, or the cast will read
	// garbage — consider an upstream accessor or a layout test. TODO confirm
	// a test guards this.
	type cheatToSeeInternals struct {
		reString string
		re       *regexp.Regexp

		setMatches    []string
		stringMatcher labels.StringMatcher
		prefix        string
		suffix        string
		contains      []string

		matchString func(string) bool
	}

	cheat := (*cheatToSeeInternals)(unsafe.Pointer(m))

	// TODO: research and improve this. we're making a guess on whether an optimization will improve the regex
	// performance enough that its faster to not memoize. See compileMatchStringFunction() in the labels
	// package. maybe there's even a way to do this dynamically?
	return cheat.stringMatcher == nil && // a stringMatcher definitively rejects or accepts. if a string matcher is present the regex will never be executed
		len(cheat.setMatches) == 0 && // setMatches definitively reject or accept. if len != 0 the regex will never be executed, but perhaps if there are a very large # of setMatches we prefer memoizing anyway?
		cheat.prefix == "" && // prefix and suffix _do not_ prevent the regex from executing, but they are quick to evaluate and tend to nicely filter down.
		cheat.suffix == "" // perhaps a length requirement would be an improvement? i.e. require a prefix or suffix of at least 3 chars?
	// len(cheat.contains) == 0 // in testing, it was faster to memoize with a contains filter. perhaps if the filters are long enough we don't memoize?
}
64 changes: 64 additions & 0 deletions pkg/regexp/regexp_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package regexp

import (
"testing"

"github.com/prometheus/prometheus/model/labels"
"github.com/stretchr/testify/require"
)

// TestRegexpMatch exercises both polarities of a single-pattern Regexp
// through the []byte and string entry points.
func TestRegexpMatch(t *testing.T) {
	for _, tc := range []struct {
		shouldMatch bool
		wantABC     bool // expected result for "abc" (matches ^a.*)
		wantXYZ     bool // expected result for "xyz" (does not match)
	}{
		{shouldMatch: true, wantABC: true, wantXYZ: false},
		{shouldMatch: false, wantABC: false, wantXYZ: true},
	} {
		r, err := NewRegexp([]string{"^a.*"}, tc.shouldMatch)
		require.NoError(t, err)

		require.Equal(t, tc.wantABC, r.Match([]byte("abc")))
		require.Equal(t, tc.wantABC, r.MatchString("abc"))
		require.Equal(t, tc.wantXYZ, r.MatchString("xyz"))
	}
}

// TestShouldMemoize pins the memoization heuristic against the optimizations
// labels.NewFastRegexMatcher applies: any pattern that yields a string
// matcher, set matches, a prefix, or a suffix is considered cheap enough to
// evaluate directly and so is not memoized.
func TestShouldMemoize(t *testing.T) {
	tcs := []struct {
		regex         string
		shouldMemoize bool
	}{
		{
			regex:         ".*", // presumably optimized to a trivial match-anything matcher — verify against labels internals
			shouldMemoize: false,
		},
		{
			regex:         "foo.*", // literal prefix extracted, so cheap
			shouldMemoize: false,
		},
		{
			regex:         ".*bar", // literal suffix extracted, so cheap
			shouldMemoize: false,
		},
		{
			regex:         "foo|bar", // alternation of literals becomes setMatches
			shouldMemoize: false,
		},
		{
			regex:         ".*bar.*", // creates a containsStringMatcher so should not memoize
			shouldMemoize: false,
		},
		{
			regex:         ".*bar.*foo.*", // calls contains in order
			shouldMemoize: true,
		},
	}

	for _, tc := range tcs {
		t.Run(tc.regex, func(t *testing.T) {
			m, err := labels.NewFastRegexMatcher(tc.regex)
			require.NoError(t, err)
			require.Equal(t, tc.shouldMemoize, shouldMemoize(m))
		})
	}
}
2 changes: 1 addition & 1 deletion pkg/traceql/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ import (
"fmt"
"hash/fnv"
"math"
"regexp"
"slices"
"time"
"unsafe"

"github.com/grafana/tempo/pkg/regexp"
"github.com/grafana/tempo/pkg/tempopb"
)

Expand Down
9 changes: 5 additions & 4 deletions pkg/traceql/ast_execute.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ import (
"errors"
"fmt"
"math"
"regexp"
"strings"

"github.com/grafana/tempo/pkg/regexp"
)

var errSpansetOperationMultiple = errors.New("spanset operators are not supported for multiple spansets per trace. consider using coalesce()")
Expand Down Expand Up @@ -392,7 +393,7 @@ func (o *BinaryOperation) execute(span Span) (Static, error) {
return NewStaticBool(strings.Compare(lhs.String(), rhs.String()) <= 0), nil
case OpRegex:
if o.compiledExpression == nil {
o.compiledExpression, err = regexp.Compile(rhs.EncodeToString(false))
o.compiledExpression, err = regexp.NewRegexp([]string{rhs.EncodeToString(false)}, true)
if err != nil {
return NewStaticNil(), err
}
Expand All @@ -402,14 +403,14 @@ func (o *BinaryOperation) execute(span Span) (Static, error) {
return NewStaticBool(matched), err
case OpNotRegex:
if o.compiledExpression == nil {
o.compiledExpression, err = regexp.Compile(rhs.EncodeToString(false))
o.compiledExpression, err = regexp.NewRegexp([]string{rhs.EncodeToString(false)}, false)
if err != nil {
return NewStaticNil(), err
}
}

matched := o.compiledExpression.MatchString(lhs.EncodeToString(false))
return NewStaticBool(!matched), err
return NewStaticBool(matched), err
default:
}
}
Expand Down
8 changes: 4 additions & 4 deletions pkg/traceql/ast_execute_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ func TestSpansetOperationEvaluateArray(t *testing.T) {
},
},
{
"{ .foo =~ `ba` }", // match string array with regex
"{ .foo =~ `ba.*` }", // match string array with regex
[]*Spanset{
{Spans: []Span{
&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"bar", "baz"})}},
Expand All @@ -591,7 +591,7 @@ func TestSpansetOperationEvaluateArray(t *testing.T) {
},
},
{
"{ .foo !~ `ba` }", // regex non-matching
"{ .foo !~ `ba.*` }", // regex non-matching
[]*Spanset{
{Spans: []Span{
&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"foo", "baz"})}},
Expand Down Expand Up @@ -953,7 +953,7 @@ func TestSpansetOperationEvaluateArraySymmetric(t *testing.T) {
},
},
{
"{ `ba` =~ .foo }", // Symmetric regex match for string array
"{ `ba.*` =~ .foo }", // Symmetric regex match for string array
[]*Spanset{
{Spans: []Span{
&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"bar", "baz"})}},
Expand All @@ -969,7 +969,7 @@ func TestSpansetOperationEvaluateArraySymmetric(t *testing.T) {
},
},
{
"{ `ba` !~ .foo }", // Symmetric regex non-match for string array
"{ `ba.*` !~ .foo }", // Symmetric regex non-match for string array
[]*Spanset{
{Spans: []Span{
&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"foo", "baz"})}},
Expand Down
Loading

0 comments on commit b61c3ce

Please sign in to comment.