-
Notifications
You must be signed in to change notification settings - Fork 95
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add LIKE operator support #241
Changes from all commits
5080fc7
95e675d
62a258b
dfec2bd
968cc5c
be621af
1769b49
e5b9f68
7968a87
30c1bd0
ce37a42
ddcd088
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package expr | ||
|
||
import ( | ||
"errors" | ||
"fmt" | ||
"github.com/genjidb/genji/document" | ||
"github.com/genjidb/genji/sql/query/glob" | ||
"github.com/genjidb/genji/sql/scanner" | ||
) | ||
|
||
func like(pattern, text string) bool { | ||
return glob.MatchLike(pattern, text) | ||
} | ||
|
||
type likeOp struct { | ||
*simpleOperator | ||
} | ||
|
||
// Like creates an expression that evaluates to the result of a LIKE b. | ||
func Like(a, b Expr) Expr { | ||
return &likeOp{&simpleOperator{a, b, scanner.LIKE}} | ||
} | ||
|
||
func (op likeOp) Eval(ctx EvalStack) (document.Value, error) { | ||
a, b, err := op.simpleOperator.eval(ctx) | ||
if err != nil { | ||
return nullLitteral, err | ||
} | ||
|
||
if a.Type != document.TextValue || b.Type != document.TextValue { | ||
return nullLitteral, errors.New("LIKE operator takes a text") | ||
} | ||
|
||
if like(b.V.(string), a.V.(string)) { | ||
return trueLitteral, nil | ||
} | ||
|
||
return falseLitteral, nil | ||
} | ||
|
||
func (op likeOp) String() string { | ||
return fmt.Sprintf("%v LIKE %v", op.a, op.b) | ||
} | ||
|
||
type notLikeOp struct { | ||
likeOp | ||
} | ||
|
||
// NotLike creates an expression that evaluates to the result of a NOT LIKE b. | ||
func NotLike(a, b Expr) Expr { | ||
return ¬LikeOp{likeOp{&simpleOperator{a, b, scanner.LIKE}}} | ||
} | ||
|
||
func (op notLikeOp) Eval(ctx EvalStack) (document.Value, error) { | ||
return invertBoolResult(op.likeOp.Eval)(ctx) | ||
} | ||
|
||
func (op notLikeOp) String() string { | ||
return fmt.Sprintf("%v NOT LIKE %v", op.a, op.b) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
// Package glob implements wildcard pattern matching algorithms for strings. | ||
// | ||
package glob |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
// The author disclaims copyright to this source code. In place of | ||
// a legal notice, here is a blessing: | ||
// | ||
// May you do good and not evil. | ||
// May you find forgiveness for yourself and forgive others. | ||
// May you share freely, never taking more than you give. | ||
// | ||
// This is an optimized Go port of SQLite’s icuLikeCompare routine using backtracking. | ||
// See https://sqlite.org/src/file?name=ext%2Ficu%2Ficu.c&ln=117-195&ci=54b54f02c66c5aea | ||
|
||
package glob | ||
|
||
import ( | ||
"unicode" | ||
"unicode/utf8" | ||
) | ||
|
||
// Special characters recognized in LIKE patterns.
const (
	// matchOne is the wildcard that matches exactly one character.
	matchOne = '_'

	// matchAll is the wildcard that matches zero or more characters.
	matchAll = '%'

	// matchEsc is the escape character; the pattern character following it
	// is treated as an ordinary character.
	matchEsc = '\\'
)
|
||
// readRune decodes the first UTF-8 sequence of s and returns the decoded rune
// together with the remainder of the string.
//
// A byte that is not valid UTF-8 is not reported as utf8.RuneError; instead
// the raw byte value is returned as the rune and exactly one byte is
// consumed. For an empty string the result is utf8.RuneError and s itself.
func readRune(s string) (rune, string) {
	switch r, width := utf8.DecodeRuneInString(s); {
	case r == utf8.RuneError && width == 1:
		// Invalid encoding: hand back the offending byte as-is.
		return rune(s[0]), s[1:]
	default:
		return r, s[width:]
	}
}
|
||
// skipRune drops the first UTF-8 sequence from s and returns what is left.
// An invalid leading byte counts as one sequence of width 1; an empty string
// is returned unchanged.
func skipRune(s string) string {
	_, width := utf8.DecodeRuneInString(s)
	rest := s[width:]
	return rest
}
|
||
// equalFold reports whether the runes sr and tr are equal under simple
// Unicode case folding. It is the single-rune counterpart of
// strings.EqualFold.
func equalFold(sr, tr rune) bool {
	if sr == tr {
		return true
	}

	// Order the pair so that lo < hi; the checks below rely on it.
	lo, hi := sr, tr
	if hi < lo {
		lo, hi = hi, lo
	}

	// ASCII shortcut: the only possible match is an upper-case letter in lo
	// with its lower-case form in hi.
	if hi < utf8.RuneSelf {
		return lo >= 'A' && lo <= 'Z' && hi == lo+'a'-'A'
	}

	// Walk the fold orbit of lo. SimpleFold yields the next larger
	// equivalent rune, wrapping around to the smallest one, so stop once we
	// are back at lo or have reached/passed hi.
	folded := unicode.SimpleFold(lo)
	for folded != lo && folded < hi {
		folded = unicode.SimpleFold(folded)
	}
	return folded == hi
}
|
||
// MatchLike reports whether string s matches the SQL LIKE-style glob pattern.
// Supported wildcards are '_' (match any one character) and '%' (match zero
// or more characters). They can be escaped by '\' (escape character).
//
// MatchLike requires pattern to match whole string, not just a substring.
//
// Ordinary characters are compared with equalFold, so matching ignores case
// under simple Unicode case folding. Only one backtracking position is kept:
// the one saved at the most recently processed unescaped '%'.
func MatchLike(pattern, s string) bool {
	// prevEscape is true when the pattern character read just before the
	// current one was an unescaped matchEsc, which makes the current pattern
	// character an ordinary one.
	var prevEscape bool

	// w is the pattern that follows the last saved matchAll and t is the
	// input that was left at that point. An empty w means there is nothing
	// to backtrack to.
	var w, t string // backtracking state

	for len(s) != 0 {
		// Read (and consume) the next character from the input pattern.
		var p rune
		if len(pattern) == 0 {
			// Pattern exhausted while input remains: the only way forward
			// is to retry from the last matchAll.
			goto backtrack
		}
		p, pattern = readRune(pattern)

	loop:
		// There are now 4 possibilities:
		//
		//     1. p is an unescaped matchAll character “%”,
		//     2. p is an unescaped matchOne character “_”,
		//     3. p is an unescaped matchEsc character, or
		//     4. p is to be handled as an ordinary character
		//
		if p == matchAll && !prevEscape {
			// Case 1.
			var c byte

			// Skip any matchAll or matchOne characters that follow a
			// matchAll. For each matchOne, skip one character in the
			// test string.
			//
			// Indexing by byte is sufficient here because both wildcards
			// are single-byte characters.
			//
			for len(pattern) != 0 {
				c = pattern[0]
				if c != matchAll && c != matchOne {
					break
				}
				pattern = pattern[1:]

				if c != matchOne {
					continue
				}
				if len(s) == 0 {
					return false
				}
				s = skipRune(s)
			}

			// A trailing matchAll matches whatever input is left.
			if len(pattern) == 0 {
				return true
			}

			// Save state and match next character.
			//
			// Since we save t = s and then continue to loop for len(s) ≠ 0,
			// the condition len(t) ≠ 0 is always true when we need to backtrack.
			//
			w, t = pattern, s
		} else if p == matchOne && !prevEscape {
			// Case 2.
			//
			// We can either enter loop on normal iteration where len(s) ≠ 0,
			// or from backtracking. But we consume all matchOne characters
			// before saving backtracking state, so this case is reachable on
			// normal iteration only.
			//
			// That is, we are guaranteed to have input at this point.
			//
			s = skipRune(s)
		} else if p == matchEsc && !prevEscape {
			// Case 3.
			//
			// We can’t reach this case from backtracking to matchAll.
			// That implies len(s) ≠ 0 and normal iteration on continue.
			// We would either have an escaped character in the pattern,
			// or we’ve consumed whole pattern and attempt to backtrack.
			// If we can’t backtrack then we are not at the end of input
			// since len(s) ≠ 0, and false is returned. That said, it’s
			// impossible to exit the loop with truthy prevEscape.
			//
			prevEscape = true
		} else {
			// Case 4.
			prevEscape = false

			var r rune
			r, s = readRune(s)
			// Characters are compared with simple case folding rather than
			// directly, so 'abc' LIKE 'ABC' matches. Review discussion on
			// this change settled on following what MySQL and SQLite do;
			// comparing grapheme clusters and supporting collations was
			// suggested as the ideal but deferred to a separate change.
			if !equalFold(p, r) {
				goto backtrack
			}
		}
		continue

	backtrack:
		// If we can’t backtrack return prevEscape
		// to allow escaping end of input.
		//
		// NOTE(review): both jumps to this label appear to arrive with
		// either len(s) ≠ 0 (pattern exhausted at the top of the loop) or
		// prevEscape == false (reset in case 4), so this expression looks
		// like it always yields false here. Confirm before relying on it.
		//
		if len(w) == 0 {
			return prevEscape && len(s) == 0
		}

		// Keep the pattern and skip rune in input.
		// Note that we only backtrack to matchAll.
		//
		// Re-entering at loop with p = matchAll re-runs case 1, which saves
		// the advanced input position as the new backtracking state.
		//
		p, pattern = matchAll, w
		prevEscape = false
		s = skipRune(t)

		goto loop
	}

	// Input is exhausted.
	// Check that the rest of the pattern is matchAll.
	for i := 0; i < len(pattern); i++ {
		if pattern[i] == matchAll {
			continue
		}

		// Allow escaping end of string.
		if i+1 == len(pattern) {
			if pattern[i] == matchEsc {
				return true
			}
		}

		return false
	}
	return true
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for quoting the source 🙏🏼