SQL Parser Performance (#731)
* eliminate channels from lexer
* refactor parser for no-channels lexer
* refactor misc tests for no-channels lexer

Signed-off-by: James Ranson <[email protected]>
jranson authored Aug 13, 2024
1 parent 6ef21df commit 65513a4
Showing 15 changed files with 172 additions and 137 deletions.
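The heart of the change: the lexer previously streamed `*token.Token` values to the parser over a channel, which required a consumer goroutine and per-token synchronization; it now appends to a `token.Tokens` slice and returns it. A minimal, self-contained sketch of the two shapes (simplified stand-in types, not the actual Trickster code):

```go
package main

import "fmt"

// Token is a simplified stand-in for the repo's token.Token.
type Token struct{ Val string }

// Before: the lexer sends each token on a channel, so a consumer
// goroutine must drain it concurrently or the send blocks.
func lexToChannel(input string, ch chan *Token) {
	for _, w := range []string{"select", "1"} { // stand-in for real scanning
		ch <- &Token{Val: w}
	}
	close(ch)
}

// After: the lexer appends to a slice and returns it; no goroutine,
// no per-token synchronization, and the parser can index and peek freely.
func lexToSlice(input string) []*Token {
	tokens := make([]*Token, 0, len(input)/4) // estimated token count
	for _, w := range []string{"select", "1"} {
		tokens = append(tokens, &Token{Val: w})
	}
	return tokens
}

func main() {
	ch := make(chan *Token, 8)
	go lexToChannel("select 1", ch)
	for t := range ch {
		fmt.Println("channel:", t.Val)
	}
	for _, t := range lexToSlice("select 1") {
		fmt.Println("slice:", t.Val)
	}
}
```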
61 changes: 33 additions & 28 deletions pkg/backends/clickhouse/parsing_test.go
@@ -162,49 +162,51 @@ func TestParseErrors(t *testing.T) {
 
 func TestAtWith(t *testing.T) {
 
-	rs := parsing.NewRunState(context.Background())
-	ch := rs.Tokens()
-	ch <- &token.Token{Typ: token.Space, Val: " "}
+	tk := token.Tokens{&token.Token{Typ: token.Space, Val: " "}}
+	rs := parsing.NewRunState(context.Background(), tk)
 	rs.Next()
 	f := atWith(nil, nil, rs)
 	if f != nil {
 		t.Error("expected nil StateFn")
 	}
 
-	rs = parsing.NewRunState(context.Background())
-	ch = rs.Tokens()
-	ch <- &token.Token{Typ: lsql.TokenWith, Val: "with"}
+	tk = token.Tokens{&token.Token{Typ: lsql.TokenWith, Val: "with"}}
+	rs = parsing.NewRunState(context.Background(), tk)
 	rs.Next()
 	atWith(nil, nil, rs)
 	if rs.Error() != parsing.ErrUnsupportedParser {
 		t.Error("expected ErrUnsupportedParser")
 	}
 
-	rs = parsing.NewRunState(context.Background())
-	ch = rs.Tokens()
-	ch <- &token.Token{Typ: lsql.TokenWith, Val: "with"}
-	ch <- &token.Token{Typ: lsql.TokenSelect, Val: "select"}
+	tk = token.Tokens{
+		&token.Token{Typ: lsql.TokenWith, Val: "with"},
+		&token.Token{Typ: lsql.TokenSelect, Val: "select"},
+	}
+
+	rs = parsing.NewRunState(context.Background(), tk)
 	rs.Next()
 	f = atWith(parser, parser, rs)
 	if f == nil {
 		t.Error("expected non-nil StateFn")
 	}
 
-	rs = parsing.NewRunState(context.Background())
-	ch = rs.Tokens()
-	ch <- &token.Token{Typ: lsql.TokenWith, Val: "with"}
-	ch <- &token.Token{Typ: token.EOF}
+	tk = token.Tokens{
+		&token.Token{Typ: lsql.TokenWith, Val: "with"},
+		&token.Token{Typ: token.EOF},
+	}
+	rs = parsing.NewRunState(context.Background(), tk)
 	rs.Next()
 	f = atWith(parser, parser, rs)
 	if f != nil {
 		t.Error("expected nil StateFn")
 	}
 
-	rs = parsing.NewRunState(context.Background())
-	ch = rs.Tokens()
-	ch <- &token.Token{Typ: lsql.TokenWith, Val: "with"}
-	ch <- &token.Token{Typ: token.Identifier, Val: "x"}
-	ch <- &token.Token{Typ: lsql.TokenSelect, Val: "select"}
+	tk = token.Tokens{
+		&token.Token{Typ: lsql.TokenWith, Val: "with"},
+		&token.Token{Typ: token.Identifier, Val: "x"},
+		&token.Token{Typ: lsql.TokenSelect, Val: "select"},
+	}
+	rs = parsing.NewRunState(context.Background(), tk)
 	rs.Next()
 	f = atWith(parser, parser, rs)
 	if f != nil {
@@ -216,7 +218,7 @@ func TestAtWith(t *testing.T) {
 }
 
 func TestAtPreWhere(t *testing.T) {
-	rs := parsing.NewRunState(context.Background())
+	rs := parsing.NewRunState(context.Background(), nil)
 	f := atPreWhere(nil, nil, rs)
 	if f != nil {
 		t.Error("expected nil StateFn")
@@ -227,19 +229,22 @@ func TestAtPreWhere(t *testing.T) {
 }
 
 func TestAtFormat(t *testing.T) {
-	rs := parsing.NewRunState(context.Background())
-	ch := rs.Tokens()
-	ch <- &token.Token{Typ: lsql.TokenComment}
-	ch <- &token.Token{Typ: token.EOF}
+
+	tk := token.Tokens{
+		&token.Token{Typ: lsql.TokenComment},
+		&token.Token{Typ: token.EOF},
+	}
+	rs := parsing.NewRunState(context.Background(), tk)
 	f := atFormat(nil, nil, rs)
 	if f != nil {
 		t.Error("expected nil StateFn")
 	}
 
-	rs = parsing.NewRunState(context.Background())
-	ch = rs.Tokens()
-	ch <- &token.Token{Typ: token.Identifier, Val: "UnsupportedFormat"}
-	ch <- &token.Token{Typ: token.EOF}
+	tk = token.Tokens{
+		&token.Token{Typ: token.Identifier, Val: "UnsupportedFormat"},
+		&token.Token{Typ: token.EOF},
+	}
+	rs = parsing.NewRunState(context.Background(), tk)
 	f = atFormat(nil, nil, rs)
 	if f != nil {
 		t.Error("expected nil StateFn")
2 changes: 1 addition & 1 deletion pkg/parsing/lex/lex.go
@@ -27,7 +27,7 @@ const EOF = -1
 
 // Lexer is the Lexer interface
 type Lexer interface {
-	Run(string, chan *token.Token)
+	Run(string) token.Tokens
 }
 
 // Options provides members that alter the behavior of the underlying Lexer
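With the interface now returning `token.Tokens`, callers make one synchronous call and range over the result. A toy implementation of the same interface shape (simplified stand-in types rather than the repo's `token` package):

```go
package main

import (
	"fmt"
	"strings"
)

// Simplified stand-ins for token.Token / token.Tokens.
type Token struct{ Val string }
type Tokens []*Token

// Lexer mirrors the updated interface: one string in, one token slice out.
type Lexer interface {
	Run(string) Tokens
}

// wordLexer is a toy Lexer that tokenizes on whitespace, used only
// to show the call shape of the channel-free interface.
type wordLexer struct{}

func (wordLexer) Run(input string) Tokens {
	out := make(Tokens, 0, len(input)/4)
	for _, f := range strings.Fields(input) {
		out = append(out, &Token{Val: strings.ToLower(f)})
	}
	return out
}

func main() {
	var l Lexer = wordLexer{}
	for _, t := range l.Run("SELECT 1") {
		fmt.Println(t.Val) // select, 1
	}
}
```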
23 changes: 12 additions & 11 deletions pkg/parsing/lex/run_state.go
@@ -24,16 +24,16 @@ import (
 	"github.com/trickstercache/trickster/v2/pkg/parsing/token"
 )
 
-// RunState contains all the information about a particular run
+// RunState contains all the information about a particular lexer run
 type RunState struct {
-	Input        string            // the string being scanned
-	InputLowered string            // the lowercase version of the string being scanned
-	InputWidth   int               // width of the input (so we don't have to keep calling len)
-	Pos          int               // current position in the input
-	Start        int               // start position of this Token
-	Width        int               // width of last rune read from input
-	Tokens       chan *token.Token // channel of scanned items
-	ParenDepth   int               // nesting depth of ( ) exprs
+	Input        string // the string being scanned
+	InputLowered string // the lowercase version of the string being scanned
+	InputWidth   int    // width of the input (to avoid multiple calls to len)
+	Pos          int    // current position in the input
+	Start        int    // start position of this Token
+	Width        int    // width of last rune read from input
+	ParenDepth   int    // nesting depth of ( ) exprs
+	Tokens       token.Tokens
 }
 
 // Next returns the next rune in the input.
@@ -62,15 +62,16 @@ func (rs *RunState) Backup() {
 
 // Emit passes a Token back to the client.
 func (rs *RunState) Emit(t token.Typ) {
-	rs.Tokens <- &token.Token{Typ: t, Pos: rs.Start, Val: rs.InputLowered[rs.Start:rs.Pos]}
+	rs.Tokens = append(rs.Tokens, &token.Token{Typ: t, Pos: rs.Start,
+		Val: rs.InputLowered[rs.Start:rs.Pos]})
 	rs.Start = rs.Pos
 }
 
 // EmitToken passes a pre-built Token back to the client. Returning nil allows the
 // function output to be used as a value when needed to consolidate EmitToken and the
 // likely subsequent return nil
 func (rs *RunState) EmitToken(i *token.Token) StateFn {
-	rs.Tokens <- i
+	rs.Tokens = append(rs.Tokens, i)
 	rs.Start = rs.Pos
 	return nil
 }
8 changes: 4 additions & 4 deletions pkg/parsing/lex/run_state_test.go
@@ -26,7 +26,7 @@ var testRunState = &RunState{
 	Input:        "Test",
 	InputLowered: "test",
 	InputWidth:   4,
-	Tokens:       make(chan *token.Token, 8),
+	Tokens:       make(token.Tokens, 0, 1),
 }
 
 func TestRunState(t *testing.T) {
@@ -88,7 +88,7 @@ func TestScanNumber(t *testing.T) {
 		Input:        "1234",
 		InputLowered: "1234",
 		InputWidth:   4,
-		Tokens:       make(chan *token.Token, 8),
+		Tokens:       make(token.Tokens, 0, 1),
 	}
 	b := tr.ScanNumber()
 	if !b {
@@ -167,7 +167,7 @@ func TestAtTeminator(t *testing.T) {
 		Input:        " ",
 		InputLowered: " ",
 		InputWidth:   1,
-		Tokens:       make(chan *token.Token, 8),
+		Tokens:       make(token.Tokens, 0, 1),
 	}
 	if !tr.AtTerminator() {
 		t.Error("expected true")
@@ -179,7 +179,7 @@ func TestErrorf(t *testing.T) {
 		Input:        " ",
 		InputLowered: " ",
 		InputWidth:   1,
-		Tokens:       make(chan *token.Token, 8),
+		Tokens:       make(token.Tokens, 0, 1),
 	}
 	tk := tr.Errorf("%s", "fail")
 	if tk.Val != "fail" {
13 changes: 7 additions & 6 deletions pkg/parsing/lex/sql/sql.go
@@ -77,24 +77,25 @@ func BaseKey() map[string]token.Typ {
 
 // Run runs the lexer against the provided string and returns tokens on the
 // provided channel. Run will end when the state is EOF or Error.
-func (l *sqllexer) Run(input string, ch chan *token.Token) {
+func (l *sqllexer) Run(input string) token.Tokens {
+	li := len(input)
 	rc := &lex.RunState{
 		Input:        input,
 		InputLowered: strings.ToLower(input),
-		InputWidth:   len(input),
-		Tokens:       ch,
+		InputWidth:   li,
+		Tokens:       make(token.Tokens, 0, li/4), // estimated # of tokens
 	}
 	for state := lexText; state != nil; {
 		state = state(l, rc)
 	}
-	close(ch)
+	return rc.Tokens
 }
 
 // state functions
 
 // lexEOLComment scans a // comment that terminates at the end of the line
 // it assumes you have already identified '//' and are positioned on the first slash
-func lexEOLComment(li lex.Lexer, rs *lex.RunState) lex.StateFn {
+func lexEOLComment(_ lex.Lexer, rs *lex.RunState) lex.StateFn {
 	rs.Pos += 2
 	i := strings.Index(rs.InputLowered[rs.Pos:], "\n")
 	if i == -1 {
@@ -112,7 +113,7 @@ func lexEOLComment(li lex.Lexer, rs *lex.RunState) lex.StateFn {
 }
 
 // lexComment scans a comment. The left comment marker is known to be present.
-func lexComment(li lex.Lexer, rs *lex.RunState) lex.StateFn {
+func lexComment(_ lex.Lexer, rs *lex.RunState) lex.StateFn {
 	rs.Pos += len(leftComment)
 	i := strings.Index(rs.InputLowered[rs.Pos:], rightComment)
 	if i < 0 {
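The `li/4` capacity hint above is a cheap way to avoid most slice growth: `append` grows the backing array geometrically, so even a rough estimate leaves only a few reallocations, and underestimating is always safe. A standalone illustration of the effect (plain Go, not the repo's types; the four-bytes-per-token figure is the commit's heuristic, not a measured constant):

```go
package main

import "fmt"

func main() {
	input := "SELECT t1.x AS x FROM test_db.test_table t1 LIMIT 100"

	// Hinted, in the spirit of make(token.Tokens, 0, len(input)/4).
	hinted := make([]string, 0, len(input)/4)
	// Unhinted: starts at zero capacity and must grow repeatedly.
	var bare []string

	hintedGrows, bareGrows := 0, 0
	hc, bc := cap(hinted), cap(bare)
	for i := 0; i < 64; i++ { // pretend the lexer emits 64 tokens
		hinted = append(hinted, "tok")
		bare = append(bare, "tok")
		if cap(hinted) != hc { // capacity changed: a reallocation happened
			hc, hintedGrows = cap(hinted), hintedGrows+1
		}
		if cap(bare) != bc {
			bc, bareGrows = cap(bare), bareGrows+1
		}
	}
	fmt.Printf("hinted: %d reallocations; unhinted: %d\n", hintedGrows, bareGrows)
}
```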
28 changes: 9 additions & 19 deletions pkg/parsing/lex/sql/sql_test.go
@@ -18,7 +18,6 @@ package sql
 
 import (
 	"fmt"
-	"sync"
 	"testing"
 
 	"github.com/trickstercache/trickster/v2/pkg/parsing/lex"
@@ -30,14 +29,12 @@ type mockParser struct {
 	l   lex.Lexer
 	lo  *lex.Options
 	err string
-	wg  sync.WaitGroup
 }
 
 // this mock parser simply drains the channel to prevent the
 // lexer from blocking on a full channel
-func (p *mockParser) run() {
-	var t *token.Token
-	for ; ; t = <-p.ch {
+func (p *mockParser) run(tokens token.Tokens) {
+	for _, t := range tokens {
 		if t != nil && t.Typ == token.Error {
 			p.err = t.Val
 		}
@@ -46,7 +43,6 @@ func (p *mockParser) run() {
 			break
 		}
 	}
-	p.wg.Done()
 }
 
 func newLexTestHarness(lo *lex.Options) *mockParser {
@@ -140,10 +136,8 @@ func TestLexer(t *testing.T) {
 	for i, test := range tests {
 		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
 			p := newLexTestHarness(test.lo)
-			p.wg.Add(1)
-			go p.run()
-			p.l.Run(test.in, p.ch)
-			p.wg.Wait()
+			tokens := p.l.Run(test.in)
+			p.run(tokens)
 			if p.err != test.expected {
 				t.Errorf(`expected "%v" got "%v"`, test.expected, p.err)
 			}
@@ -179,7 +173,7 @@ func TestStateFuncs(t *testing.T) {
 	for i, test := range tests {
 		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
 			p := newLexTestHarness(nil)
-			rs := &lex.RunState{Tokens: p.ch}
+			rs := &lex.RunState{}
 			f := test.f(p.l, rs)
 			if (test.expected != nil && f == nil) ||
 				(test.expected == nil && f != nil) {
@@ -193,23 +187,19 @@ func TestLexIdentifierError(t *testing.T) {
 func TestLexIdentifierError(t *testing.T) {
 	expected := `bad character U+0040 '@'`
 	p := newLexTestHarness(nil)
-	p.wg.Add(1)
-	go p.run()
-	rs := &lex.RunState{Tokens: p.ch, Pos: 2, InputLowered: "@@@@@@", InputWidth: 6}
+	rs := &lex.RunState{Pos: 2, InputLowered: "@@@@@@", InputWidth: 6}
 	f := lexIdentifier
 	for f != nil {
 		f = f(p.l, rs)
 	}
-	p.wg.Wait()
+	p.run(rs.Tokens)
 	if p.err != expected {
 		t.Errorf("expected `%s` got `%s`", expected, p.err)
 	}
 }
 
 func TestFullSQLLex(t *testing.T) {
 	p := newLexTestHarness(nil)
-	p.wg.Add(1)
-	go p.run()
 	stmt := `SELECT t1.x as x, t2.count(*) as cnt FROM test_db.test_table t1
 INNER JOIN test_db.test_table2 t2 ON
 t1.id = t2.secondary_id
@@ -224,8 +214,8 @@ func TestFullSQLLex(t *testing.T) {
 HAVING cnt > 15 // EOL COMMENT
 ORDER BY x
 LIMIT 100 // EOL COMMENT`
-	p.l.Run(stmt, p.ch)
-	p.wg.Wait()
+	tokens := p.l.Run(stmt)
+	p.run(tokens)
 	if p.err != "" {
 		t.Error(p.err)
 	}
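With the channel gone, the test harness no longer needs a goroutine and WaitGroup to drain tokens while the lexer runs: lexing completes synchronously, and assertions run over the returned slice afterward. The general shape of that change (illustrative sketch, simplified from the diff above):

```go
package main

import "fmt"

type token struct{ val string }

// lex is a stand-in for the channel-free Lexer.Run.
func lex(input string) []*token {
	return []*token{{val: "select"}, {val: "1"}}
}

// check mirrors the mock parser's role: scan the finished token
// slice for errors after lexing has fully completed.
func check(tokens []*token) error {
	for _, t := range tokens {
		if t.val == "error" {
			return fmt.Errorf("lex error")
		}
	}
	return nil
}

func main() {
	// No go/WaitGroup choreography: run, then inspect.
	tokens := lex("select 1")
	if err := check(tokens); err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("ok:", len(tokens), "tokens")
}
```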
26 changes: 13 additions & 13 deletions pkg/parsing/run_state.go
@@ -24,21 +24,25 @@ import (
 
 // RunState maintains the state of a unique parsing run
 type RunState struct {
-	tokens                   chan *token.Token
+	tokens                   token.Tokens
 	prev, curr, next, lastkw *token.Token
 	err                      error
 	ctx                      context.Context
 	nextOverride             StateFn
 	isPeeked                 bool
 	results                  map[string]interface{}
+	pos                      int
+	cnt                      int
 }
 
 // NewRunState returns a new RunState object for the parser
-func NewRunState(ctx context.Context) *RunState {
+func NewRunState(ctx context.Context, tokens token.Tokens) *RunState {
+	t := tokens.Compress()
 	rs := &RunState{
-		tokens:  make(chan *token.Token, 8),
 		ctx:     ctx,
 		results: make(map[string]interface{}),
+		tokens:  t,
+		cnt:     len(t),
+		pos:     -1,
 	}
 	return rs
 }
@@ -129,9 +133,10 @@ func (rs *RunState) Peek() *token.Token {
 	if rs.curr != nil && rs.curr.Typ == token.EOF {
 		return rs.curr
 	}
-	// this filters nil tokens so the parser is guaranteed to never encounter them
-	for ; rs.next == nil; rs.next = <-rs.tokens {
-	}
+	if rs.pos+1 >= rs.cnt {
+		return &token.Token{Typ: token.EOF, Pos: rs.pos}
+	}
+	rs.next = rs.tokens[rs.pos+1]
 	return rs.next
}

@@ -140,17 +145,12 @@ func (rs *RunState) IsPeeked() bool {
 	return rs.next != nil
 }
 
-// Next retrieves the next location by peeking and then advancing
-// the state
+// Next retrieves the next location by peeking and then advancing the state
 func (rs *RunState) Next() *token.Token {
 	rs.Peek()
 	rs.prev = rs.curr
 	rs.curr = rs.next
+	rs.pos += 1
 	rs.next = nil
 	return rs.curr
 }
-
-// Tokens returns the Tokens Channel for the Run
-func (rs *RunState) Tokens() chan *token.Token {
-	return rs.tokens
-}
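With tokens held in a slice, the parser's lookahead reduces to cursor arithmetic: `Peek` reads `pos+1` and synthesizes an EOF token past the end, and `Next` advances the position. A condensed sketch of that pattern (simplified types; it mirrors the shape of the reworked `RunState` above, not its full behavior):

```go
package main

import "fmt"

type typ int

const (
	tEOF typ = iota
	tWord
)

type tok struct {
	t typ
	v string
}

// cursor is a minimal slice-backed token reader in the style of the
// reworked RunState: pos starts at -1, Peek looks at pos+1, Next advances.
type cursor struct {
	tokens []tok
	pos    int
}

func (c *cursor) Peek() tok {
	if c.pos+1 >= len(c.tokens) {
		return tok{t: tEOF} // synthesize EOF past the end
	}
	return c.tokens[c.pos+1]
}

func (c *cursor) Next() tok {
	t := c.Peek()
	if t.t != tEOF {
		c.pos++
	}
	return t
}

func main() {
	c := &cursor{tokens: []tok{{tWord, "select"}, {tWord, "1"}}, pos: -1}
	for t := c.Next(); t.t != tEOF; t = c.Next() {
		fmt.Println(t.v)
	}
}
```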
