Skip to content

Commit

Permalink
Use Prometheus fast regexp (#4329)
Browse files Browse the repository at this point in the history
* basic integration

Signed-off-by: Joe Elliott <[email protected]>

* patch tests for new meaning

Signed-off-by: Joe Elliott <[email protected]>

* patch up more tests

Signed-off-by: Joe Elliott <[email protected]>

* add basic tests

Signed-off-by: Joe Elliott <[email protected]>

* changelog + docs

Signed-off-by: Joe Elliott <[email protected]>

* remove benches

Signed-off-by: Joe Elliott <[email protected]>

* Cleaned up + tests

Signed-off-by: Joe Elliott <[email protected]>

* comment

Signed-off-by: Joe Elliott <[email protected]>

* lint

Signed-off-by: Joe Elliott <[email protected]>

* Update docs/sources/tempo/traceql/_index.md

Co-authored-by: Kim Nylander <[email protected]>

* comment

Signed-off-by: Joe Elliott <[email protected]>

---------

Signed-off-by: Joe Elliott <[email protected]>
Co-authored-by: Kim Nylander <[email protected]>
  • Loading branch information
joe-elliott and knylander-grafana authored Nov 20, 2024
1 parent 7073d04 commit b61c3ce
Show file tree
Hide file tree
Showing 9 changed files with 214 additions and 60 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
* [ENHANCEMENT] Add `invalid_utf8` to reasons spanmetrics will discard spans. [#4293](https://github.com/grafana/tempo/pull/4293) (@zalegrala)
* [ENHANCEMENT] Reduce frontend and querier allocations by dropping HTTP headers early in the pipeline. [#4298](https://github.com/grafana/tempo/pull/4298) (@joe-elliott)
* [ENHANCEMENT] Reduce ingester working set by improving prealloc behavior. [#4344](https://github.com/grafana/tempo/pull/4344) (@joe-elliott)
* [ENHANCEMENT] Use Prometheus fast regexp for TraceQL regular expression matchers. [#4329](https://github.com/grafana/tempo/pull/4329) (@joe-elliott)
**BREAKING CHANGE** All regular expression matchers will now be fully anchored. `span.foo =~ "bar"` will now be evaluated as `span.foo =~ "^bar$"`
* [BUGFIX] Replace hedged requests roundtrips total with a counter. [#4063](https://github.com/grafana/tempo/pull/4063) [#4078](https://github.com/grafana/tempo/pull/4078) (@galalen)
* [BUGFIX] Metrics generators: Correctly drop from the ring before stopping ingestion to reduce drops during a rollout. [#4101](https://github.com/grafana/tempo/pull/4101) (@joe-elliott)
* [BUGFIX] Correctly handle 400 Bad Request and 404 Not Found in gRPC streaming [#4144](https://github.com/grafana/tempo/pull/4144) (@mapno)
Expand Down
3 changes: 2 additions & 1 deletion docs/sources/tempo/traceql/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,8 @@ The implemented comparison operators are:
- `=~` (regular expression)
- `!~` (negated regular expression)

TraceQL uses Golang regular expressions. Online regular expression testing sites like https://regex101.com/ are convenient to validate regular expressions used in TraceQL queries.
TraceQL uses Golang regular expressions. Online regular expression testing sites like https://regex101.com/ are convenient to validate regular expressions used in TraceQL queries. All regular expressions
are treated as fully anchored. For example, `span.foo =~ "bar"` is evaluated as `span.foo =~ "^bar$"`.

For example, to find all traces where an `http.status_code` attribute in a span are greater than `400` but less than equal to `500`:

Expand Down
62 changes: 14 additions & 48 deletions pkg/parquetquery/predicates.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ package parquetquery
import (
"bytes"
"fmt"
"regexp"
"strings"

"github.com/grafana/tempo/pkg/regexp"

pq "github.com/parquet-go/parquet-go"
)

Expand Down Expand Up @@ -88,9 +89,7 @@ func (p *StringInPredicate) KeepPage(pq.Page) bool {
}

type regexPredicate struct {
regs []*regexp.Regexp
matches map[string]bool
shouldMatch bool
matcher *regexp.Regexp
}

var _ Predicate = (*regexPredicate)(nil)
Expand All @@ -108,66 +107,33 @@ func NewRegexNotInPredicate(regs []string) (Predicate, error) {
}

func newRegexPredicate(regs []string, shouldMatch bool) (Predicate, error) {
p := &regexPredicate{
regs: make([]*regexp.Regexp, 0, len(regs)),
matches: make(map[string]bool),
shouldMatch: shouldMatch,
}
for _, reg := range regs {
r, err := regexp.Compile(reg)
if err != nil {
return nil, err
}
p.regs = append(p.regs, r)
m, err := regexp.NewRegexp(regs, shouldMatch)
if err != nil {
return nil, err
}
return p, nil

return &regexPredicate{
matcher: m,
}, nil
}

func (p *regexPredicate) String() string {
var strings string
for _, s := range p.regs {
strings += fmt.Sprintf("%s, ", s.String())
}
return fmt.Sprintf("RegexInPredicate{%s}", strings)
return fmt.Sprintf("RegexPredicate{%s}", p.matcher.String())
}

func (p *regexPredicate) keep(v *pq.Value) bool {
if v.IsNull() {
return false
}

b := v.ByteArray()

// Check uses zero alloc optimization of map[string([]byte)]
if matched, ok := p.matches[string(b)]; ok {
return matched
}

matched := false
for _, r := range p.regs {
if r.Match(b) == p.shouldMatch {
matched = true
break
}
}

// Only alloc the string when updating the map
p.matches[string(b)] = matched

return matched
return p.matcher.Match(v.ByteArray())
}

func (p *regexPredicate) KeepColumnChunk(cc *ColumnChunkHelper) bool {
d := cc.Dictionary()

// Reset match cache on each row group change
// Use exact size of the incoming dictionary
// if present and larger.
count := len(p.matches)
if d != nil && d.Len() > count {
count = d.Len()
}
p.matches = make(map[string]bool, count)
// should we do this?
p.matcher.Reset()

if d != nil {
return keepDictionary(d, p.KeepValue)
Expand Down
120 changes: 120 additions & 0 deletions pkg/regexp/regexp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package regexp

import (
	"fmt"
	"regexp"
	"strings"
	"unsafe"

	"github.com/prometheus/prometheus/model/labels"
)

// in order to prevent building an enormous map on extremely high cardinality fields we institute a max.
// this number is not tuned. on extremely high cardinality fields memoization is wasteful b/c we rarely
// see the same value 2x. this is all overhead. on lower cardinality fields with an expensive regex memoization
// is very effective at speeding up queries.
const maxMemoize = 1000

// Regexp evaluates one or more regular expressions using Prometheus'
// FastRegexMatcher and optionally memoizes results for patterns the
// matcher could not fully optimize.
type Regexp struct {
	matchers []*labels.FastRegexMatcher // compiled patterns; a value is kept when ANY matcher agrees with shouldMatch
	matches  map[string]bool            // memoized results keyed by input value; nil when memoization is disabled
	// shouldMatch is the polarity: true keeps values that match (=~),
	// false keeps values that do not match (!~).
	shouldMatch bool
}

// NewRegexp compiles the given regular expressions into Prometheus
// FastRegexMatchers. shouldMatch sets the polarity of Match/MatchString:
// true means "keep values matched by any pattern" (=~), false means
// "keep values rejected by any pattern" (!~).
// It returns an error if any expression fails to compile.
func NewRegexp(regexps []string, shouldMatch bool) (*Regexp, error) {
	matchers := make([]*labels.FastRegexMatcher, 0, len(regexps))

	for _, r := range regexps {
		m, err := labels.NewFastRegexMatcher(r)
		if err != nil {
			return nil, err
		}
		matchers = append(matchers, m)
	}

	// only memoize if there's an unoptimized matcher. fully optimized
	// matchers are cheap enough that caching their results is pure overhead.
	// the cache is capped at maxMemoize entries (enforced in MatchString).
	var matches map[string]bool
	for _, m := range matchers {
		if shouldMemoize(m) {
			matches = make(map[string]bool)
			break
		}
	}

	return &Regexp{
		matchers:    matchers,
		matches:     matches,
		shouldMatch: shouldMatch,
	}, nil
}

// Match reports whether b is kept by this Regexp (subject to the shouldMatch
// polarity). The []byte is viewed as a string without copying via unsafe, so
// b must not be mutated for the duration of the call.
func (r *Regexp) Match(b []byte) bool {
	return r.MatchString(unsafe.String(unsafe.SliceData(b), len(b)))
}

// MatchString reports whether s is kept by this Regexp: with shouldMatch ==
// true it returns true when any matcher accepts s; with shouldMatch == false
// it returns true when any matcher rejects s. Results are memoized (up to
// maxMemoize entries) when memoization was enabled at construction time.
func (r *Regexp) MatchString(s string) bool {
	// if we're memoizing check existing matches. the lookup itself is
	// zero-alloc even when s aliases caller-owned bytes (see Match).
	if r.matches != nil {
		if matched, ok := r.matches[s]; ok {
			return matched
		}
	}

	matched := false
	for _, m := range r.matchers {
		if m.MatchString(s) == r.shouldMatch {
			matched = true
			break
		}
	}

	if r.matches != nil && len(r.matches) < maxMemoize {
		// clone the key before storing: s may alias a caller-owned []byte
		// (see Match's unsafe.String conversion) whose backing array can be
		// mutated or reused after we return. storing the aliased string
		// directly would silently corrupt the memoization map.
		r.matches[strings.Clone(s)] = matched
	}

	return matched
}

// Reset drops all memoized results, keeping the map's allocated buckets for
// reuse. It is a no-op when memoization is disabled.
func (r *Regexp) Reset() {
	if r.matches == nil {
		return
	}
	clear(r.matches)
}

// String returns a debug representation of the patterns: each pattern
// followed by ", " (including a trailing separator), matching the previous
// implementation's output exactly.
func (r *Regexp) String() string {
	// previous version accumulated into a local named "strings", shadowing
	// the strings package, via per-iteration fmt.Sprintf. a Builder avoids
	// both the shadowing and the repeated formatting/allocation.
	var sb strings.Builder
	for _, m := range r.matchers {
		sb.WriteString(m.GetRegexString())
		sb.WriteString(", ")
	}
	return sb.String()
}

// shouldMemoize returns true if we believe that memoizing this regex would be faster
// than evaluating it directly. see thoughts below.
func shouldMemoize(m *labels.FastRegexMatcher) bool {
	// matches labels.FastRegexMatcher.
	// NOTE(review): this mirrors the private field layout of the Prometheus
	// type so the unsafe cast below can peek at it. it MUST be kept in sync
	// whenever the prometheus dependency is upgraded, or the cast will read
	// garbage — consider an upstream accessor or a layout test. TODO confirm
	// a test guards this.
	type cheatToSeeInternals struct {
		reString string
		re       *regexp.Regexp

		setMatches    []string
		stringMatcher labels.StringMatcher
		prefix        string
		suffix        string
		contains      []string

		matchString func(string) bool
	}

	cheat := (*cheatToSeeInternals)(unsafe.Pointer(m))

	// TODO: research and improve this. we're making a guess on whether an optimization will improve the regex
	// performance enough that its faster to not memoize. See compileMatchStringFunction() in the labels
	// package. maybe there's even a way to do this dynamically?
	return cheat.stringMatcher == nil && // a stringMatcher definitively rejects or accepts. if a string matcher is present the regex will never be executed
		len(cheat.setMatches) == 0 && // setMatches definitively reject or accept. if len != 0 the regex will never be executed, but perhaps if there are a very large # of setMatches we prefer memoizing anyway?
		cheat.prefix == "" && // prefix and suffix _do not_ prevent the regex from executing, but they are quick to evaluate and tend to nicely filter down.
		cheat.suffix == "" // perhaps a length requirement would be an improvement? i.e. require a prefix or suffix of at least 3 chars?
	// len(cheat.contains) == 0 // in testing, it was faster to memoize with a contains filter. perhaps if the filters are long enough we don't memoize?
}
64 changes: 64 additions & 0 deletions pkg/regexp/regexp_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package regexp

import (
"testing"

"github.com/prometheus/prometheus/model/labels"
"github.com/stretchr/testify/require"
)

// TestRegexpMatch exercises both polarities of a single-pattern Regexp
// through the []byte and string entry points.
func TestRegexpMatch(t *testing.T) {
	for _, tc := range []struct {
		shouldMatch bool
		wantABC     bool // expected result for "abc" (matches ^a.*)
		wantXYZ     bool // expected result for "xyz" (does not match)
	}{
		{shouldMatch: true, wantABC: true, wantXYZ: false},
		{shouldMatch: false, wantABC: false, wantXYZ: true},
	} {
		r, err := NewRegexp([]string{"^a.*"}, tc.shouldMatch)
		require.NoError(t, err)

		require.Equal(t, tc.wantABC, r.Match([]byte("abc")))
		require.Equal(t, tc.wantABC, r.MatchString("abc"))
		require.Equal(t, tc.wantXYZ, r.MatchString("xyz"))
	}
}

// TestShouldMemoize pins the memoization heuristic against the optimizations
// labels.NewFastRegexMatcher applies: any pattern that yields a string
// matcher, set matches, a prefix, or a suffix is considered cheap enough to
// evaluate directly and so is not memoized.
func TestShouldMemoize(t *testing.T) {
	tcs := []struct {
		regex         string
		shouldMemoize bool
	}{
		{
			regex:         ".*", // presumably optimized to a trivial match-anything matcher — verify against labels internals
			shouldMemoize: false,
		},
		{
			regex:         "foo.*", // literal prefix extracted, so cheap
			shouldMemoize: false,
		},
		{
			regex:         ".*bar", // literal suffix extracted, so cheap
			shouldMemoize: false,
		},
		{
			regex:         "foo|bar", // alternation of literals becomes setMatches
			shouldMemoize: false,
		},
		{
			regex:         ".*bar.*", // creates a containsStringMatcher so should not memoize
			shouldMemoize: false,
		},
		{
			regex:         ".*bar.*foo.*", // calls contains in order
			shouldMemoize: true,
		},
	}

	for _, tc := range tcs {
		t.Run(tc.regex, func(t *testing.T) {
			m, err := labels.NewFastRegexMatcher(tc.regex)
			require.NoError(t, err)
			require.Equal(t, tc.shouldMemoize, shouldMemoize(m))
		})
	}
}
2 changes: 1 addition & 1 deletion pkg/traceql/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ import (
"fmt"
"hash/fnv"
"math"
"regexp"
"slices"
"time"
"unsafe"

"github.com/grafana/tempo/pkg/regexp"
"github.com/grafana/tempo/pkg/tempopb"
)

Expand Down
9 changes: 5 additions & 4 deletions pkg/traceql/ast_execute.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ import (
"errors"
"fmt"
"math"
"regexp"
"strings"

"github.com/grafana/tempo/pkg/regexp"
)

var errSpansetOperationMultiple = errors.New("spanset operators are not supported for multiple spansets per trace. consider using coalesce()")
Expand Down Expand Up @@ -392,7 +393,7 @@ func (o *BinaryOperation) execute(span Span) (Static, error) {
return NewStaticBool(strings.Compare(lhs.String(), rhs.String()) <= 0), nil
case OpRegex:
if o.compiledExpression == nil {
o.compiledExpression, err = regexp.Compile(rhs.EncodeToString(false))
o.compiledExpression, err = regexp.NewRegexp([]string{rhs.EncodeToString(false)}, true)
if err != nil {
return NewStaticNil(), err
}
Expand All @@ -402,14 +403,14 @@ func (o *BinaryOperation) execute(span Span) (Static, error) {
return NewStaticBool(matched), err
case OpNotRegex:
if o.compiledExpression == nil {
o.compiledExpression, err = regexp.Compile(rhs.EncodeToString(false))
o.compiledExpression, err = regexp.NewRegexp([]string{rhs.EncodeToString(false)}, false)
if err != nil {
return NewStaticNil(), err
}
}

matched := o.compiledExpression.MatchString(lhs.EncodeToString(false))
return NewStaticBool(!matched), err
return NewStaticBool(matched), err
default:
}
}
Expand Down
8 changes: 4 additions & 4 deletions pkg/traceql/ast_execute_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ func TestSpansetOperationEvaluateArray(t *testing.T) {
},
},
{
"{ .foo =~ `ba` }", // match string array with regex
"{ .foo =~ `ba.*` }", // match string array with regex
[]*Spanset{
{Spans: []Span{
&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"bar", "baz"})}},
Expand All @@ -591,7 +591,7 @@ func TestSpansetOperationEvaluateArray(t *testing.T) {
},
},
{
"{ .foo !~ `ba` }", // regex non-matching
"{ .foo !~ `ba.*` }", // regex non-matching
[]*Spanset{
{Spans: []Span{
&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"foo", "baz"})}},
Expand Down Expand Up @@ -953,7 +953,7 @@ func TestSpansetOperationEvaluateArraySymmetric(t *testing.T) {
},
},
{
"{ `ba` =~ .foo }", // Symmetric regex match for string array
"{ `ba.*` =~ .foo }", // Symmetric regex match for string array
[]*Spanset{
{Spans: []Span{
&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"bar", "baz"})}},
Expand All @@ -969,7 +969,7 @@ func TestSpansetOperationEvaluateArraySymmetric(t *testing.T) {
},
},
{
"{ `ba` !~ .foo }", // Symmetric regex non-match for string array
"{ `ba.*` !~ .foo }", // Symmetric regex non-match for string array
[]*Spanset{
{Spans: []Span{
&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"foo", "baz"})}},
Expand Down
Loading

0 comments on commit b61c3ce

Please sign in to comment.