diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go
index de01926..d830831 100644
--- a/rule/sexpr/lexer.go
+++ b/rule/sexpr/lexer.go
@@ -1,7 +1,16 @@
 package sexpr
 
-import "unicode"
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"unicode"
+	"unicode/utf8"
+)
 
+// Token is the fundamental identifier of the lexical scanner.
+// Every scanned element will be assigned a token type.
 type Token int
 
 const (
@@ -44,7 +53,7 @@ func isNumber(r rune) bool {
 	// Note, although we allow a number to contain a decimal
 	// point, it can't start with one so we don't include that in
 	// the predicate.
-	return r == '-' || (r >= '0' && r <= '9')
+	return r == '-' || unicode.IsDigit(r)
 }
 
 // isBool returns true if the rune is the # (hash or octothorpe)
@@ -64,3 +73,385 @@ func isComment(r rune) bool {
 func isSymbol(r rune) bool {
 	return !(isWhitespace(r) || isLParen(r) || isRParen(r) || isString(r) || isNumber(r) || isBool(r) || isComment(r))
 }
+
+// Scanner is a lexical scanner for extracting the lexical tokens from
+// a string of characters in our rule symbolic expression language.
+type Scanner struct {
+	r                     *bufio.Reader
+	byteCount             int
+	charCount             int
+	lineCount             int
+	lineCharCount         int
+	previousLineCharCount int
+}
+
+// NewScanner wraps a Scanner around the provided io.Reader so that we
+// might scan lexical tokens for the rule symbolic expression language
+// from it.
+func NewScanner(r io.Reader) *Scanner {
+	return &Scanner{
+		r:         bufio.NewReader(r),
+		lineCount: 1,
+	}
+}
+
+// Scan returns the next lexical token found in the Scanner's io.Reader.
+func (s *Scanner) Scan() (Token, string, error) {
+	rn, err := s.readRune()
+	if err != nil {
+		se := err.(*ScanError)
+		if se.EOF {
+			return EOF, "", nil
+		}
+		return EOF, "", err
+	}
+	switch {
+	case isLParen(rn):
+		return LPAREN, "(", nil
+	case isRParen(rn):
+		return RPAREN, ")", nil
+	case isWhitespace(rn):
+		s.unreadRune(rn)
+		return s.scanWhitespace()
+	case isString(rn):
+		return s.scanString()
+	case isNumber(rn):
+		s.unreadRune(rn)
+		return s.scanNumber()
+	case isBool(rn):
+		return s.scanBool()
+	case isComment(rn):
+		return s.scanComment()
+	case isSymbol(rn):
+		s.unreadRune(rn)
+		return s.scanSymbol()
+	}
+
+	return EOF, string(rn), s.newScanError("Illegal character scanned")
+}
+
+// readRune pulls the next rune from the input sequence.
+func (s *Scanner) readRune() (rune, error) {
+	rn, size, err := s.r.ReadRune()
+	// EOF is a special case, it shouldn't affect counts
+	if err == io.EOF {
+		return rn, s.eof()
+	}
+	// We need to update the counts correctly before considering
+	// any error, so that the data embedded in the ScanError is
+	// correct.
+	s.byteCount += size
+	s.charCount++
+	s.lineCharCount++
+	if rn == '\n' {
+		// DOS/Windows encoding does \r\n for new lines, but
+		// we can ignore the \r and still get the right
+		// result.
+		s.lineCount++
+		// Store the previous line char count in case we unread
+		s.previousLineCharCount = s.lineCharCount
+		// it's char zero, the next readRune should take us to 1
+		s.lineCharCount = 0
+	}
+	if err != nil {
+		return rn, s.newScanError(err.Error())
+	}
+	return rn, nil
+}
+
+// unreadRune puts the last readRune back on the buffer and resets the
+// counters. It requires that the rune to be unread is passed, as we
+// need to know the byte size of the rune.
+func (s *Scanner) unreadRune(rn rune) {
+	err := s.r.UnreadRune()
+	if err != nil {
+		// This means something truly awful happened!
+		panic(err.Error())
+	}
+	// Decrement counts after the unread is complete
+	s.byteCount -= utf8.RuneLen(rn)
+	s.charCount--
+	s.lineCharCount--
+	if rn == '\n' {
+		s.lineCount--
+		// previousLineCharCount includes the '\n' itself; step back past it.
+		s.lineCharCount = s.previousLineCharCount - 1
+	}
+
+}
+
+// scanWhitespace scans a contiguous sequence of whitespace
+// characters. Note that this will consume newlines as it goes,
+// lexically speaking they're insignificant to the language.
+func (s *Scanner) scanWhitespace() (Token, string, error) {
+	var b bytes.Buffer
+	for {
+		rn, err := s.readRune()
+		if err != nil {
+			se := err.(*ScanError)
+			if se.EOF {
+				// We'll get EOF next time we try to
+				// read a rune anyway, so we don't
+				// have to care about it here, which
+				// simplifies things.
+				return WHITESPACE, b.String(), nil
+
+			}
+			return WHITESPACE, b.String(), err
+
+		}
+		if !isWhitespace(rn) {
+			s.unreadRune(rn)
+			break
+		}
+		b.WriteRune(rn)
+	}
+	return WHITESPACE, b.String(), nil
+}
+
+// scanString returns the contents of single, contiguous, double-quote delimited string constant.
+func (s *Scanner) scanString() (Token, string, error) {
+	var b bytes.Buffer
+	escape := false
+	for {
+		rn, err := s.readRune()
+		if err != nil {
+			se := err.(*ScanError)
+			if se.EOF {
+				// we reached the end of the file
+				// without seeing a terminator, that's
+				// an error.
+				return STRING, b.String(), s.newScanError("unterminated string constant")
+			}
+			return STRING, b.String(), err
+		}
+		if escape {
+			b.WriteRune(rn)
+			escape = false
+			continue
+		}
+		if isString(rn) {
+			break
+		}
+		escape = rn == '\\'
+		if !escape {
+			b.WriteRune(rn)
+		}
+	}
+	return STRING, b.String(), nil
+}
+
+// scanNumber scans a contiguous string representing a number. As we
+// have to handle the negative numeric form, it's possible that the
+// '-' rune can prefix a number. This is problematic because '-' can
+// also be a symbol referring to the arithmetic operation "minus" -
+// that confusion is resolved by scanNumber, and should it consider
+// the latter case to be true it will return a SYMBOL rather than a
+// NUMBER.
+func (s *Scanner) scanNumber() (Token, string, error) {
+	var b bytes.Buffer
+
+	// We can be certain this isn't EOF because we will already
+	// have read and unread the rune before arriving here.
+	rn, err := s.readRune()
+	if err != nil {
+		// Something drastic happened, because we read this fine the first time.
+		return NUMBER, "", err
+	}
+
+	// Whatever happens we'll want the rune.
+	b.WriteRune(rn)
+
+	// Deal with the first rune. Numbers have special rules about
+	// the first rune, specifically it, and only it, may be the
+	// minus symbol. When we loop later any occurrence of '-' will
+	// be an error.
+	if rn == '-' {
+
+		// Now we look ahead to see if a number is coming, if
+		// it's anything else then this isn't a negative
+		// number, but some other form.
+		rn, err := s.readRune()
+
+		if err != nil {
+			se := err.(*ScanError)
+			if se.EOF {
+				// In reality having '-' as the final
+				// symbol in a stream is never useful,
+				// but this is the sort of error we
+				// should catch in the Parser, not the
+				// scanner.
+				return SYMBOL, b.String(), nil
+			}
+			return NUMBER, b.String(), err
+		}
+
+		// We've stored the rune, and we know we'll want to
+		// unread whatever happens, so let's just do that now.
+		s.unreadRune(rn)
+
+		// If the next rune isn't a digit then we're going to
+		// assume this is the minus operator and return '-' as
+		// a symbol instead of a number. There are still
+		// cases where this wouldn't be valid, but they're all
+		// errors and we'll leave that for the Parser to
+		// handle.
+		if !unicode.IsDigit(rn) {
+			return SYMBOL, b.String(), nil
+		}
+	}
+
+	// OK, let's scan the rest of the number...
+
+	for {
+		rn, err := s.readRune()
+		if err != nil {
+			se := err.(*ScanError)
+			if se.EOF {
+				// EOF is a valid terminator for a number
+				return NUMBER, b.String(), nil
+			}
+			return NUMBER, b.String(), err
+		}
+		if rn == '-' {
+			// As we said before '-' can't appear in the
+			// body of a number, this is an error.
+			return NUMBER, b.String(), s.newScanError("invalid number format (minus can only appear at the beginning of a number)")
+		}
+
+		// Valid number parts are written to the buffer
+		if isNumber(rn) || rn == '.' {
+			b.WriteRune(rn)
+			continue
+		}
+		// we hit a terminating character, end the number here.
+		s.unreadRune(rn)
+		break
+	}
+	return NUMBER, b.String(), nil
+}
+
+// scanBool scans the contiguous characters following the '#' symbol,
+// if they are either 'true', or 'false' a BOOL is returned, otherwise
+// a ScanError will be returned.
+func (s *Scanner) scanBool() (Token, string, error) {
+
+	var b bytes.Buffer
+
+	for {
+		rn, err := s.readRune()
+		if err != nil {
+			se := err.(*ScanError)
+			if se.EOF {
+				// EOF is a valid terminator for a boolean
+				break
+			}
+			return BOOL, b.String(), err
+		}
+
+		// isSymbol is handy shorthand for "it's not anything else"
+		if !isSymbol(rn) {
+			s.unreadRune(rn)
+			break
+		}
+		b.WriteRune(rn)
+	}
+
+	symbol := b.String()
+	if symbol == "true" || symbol == "false" {
+		return BOOL, symbol, nil
+	}
+	if len(symbol) > 0 {
+		return BOOL, symbol, s.newScanError(fmt.Sprintf("invalid boolean: %s", symbol))
+	}
+	return BOOL, symbol, s.newScanError("invalid boolean")
+}
+
+// scanComment will scan to the end of the current line, consuming any and all chars prior to '\n'.
+func (s *Scanner) scanComment() (Token, string, error) {
+	var b bytes.Buffer
+
+	for {
+		rn, err := s.readRune()
+		if err != nil {
+			se := err.(*ScanError)
+			if se.EOF {
+				// EOF is a valid terminator for a Comment
+				break
+			}
+			return COMMENT, b.String(), err
+		}
+
+		if rn == '\n' {
+			break
+		}
+
+		b.WriteRune(rn)
+	}
+	return COMMENT, b.String(), nil
+}
+
+// scanSymbol scans a contiguous block of symbol characters. Any non-symbol character will terminate it.
+func (s *Scanner) scanSymbol() (Token, string, error) {
+	var b bytes.Buffer
+
+	for {
+		rn, err := s.readRune()
+		if err != nil {
+			se := err.(*ScanError)
+			if se.EOF {
+				// EOF is a valid terminator for a Symbol
+				break
+			}
+			return SYMBOL, b.String(), err
+		}
+		// Again, we have to special case '-', which can't start a symbol, but can appear in it.
+		// Likewise numbers.
+		if !(isSymbol(rn) || rn == '-' || unicode.IsDigit(rn)) {
+			s.unreadRune(rn)
+			break
+		}
+		b.WriteRune(rn)
+	}
+
+	return SYMBOL, b.String(), nil
+}
+
+// newScanError returns a ScanError initialised with the current
+// positional information of the Scanner.
+func (s *Scanner) newScanError(message string) *ScanError {
+	return &ScanError{
+		Byte:       s.byteCount,
+		Char:       s.charCount,
+		Line:       s.lineCount,
+		CharInLine: s.lineCharCount,
+		msg:        message,
+	}
+}
+
+// eof returns a ScanError, initialised with the current positional
+// information of the Scanner, and with its EOF field set to True.
+func (s *Scanner) eof() *ScanError {
+	err := s.newScanError("EOF")
+	err.EOF = true
+	return err
+}
+
+// ScanError is a type that implements the Error interface, but adds
+// additional context information to errors that can be inspected. It
+// is intended to be used for all errors emerging from the Scanner.
+type ScanError struct {
+	Byte       int
+	Char       int
+	Line       int
+	CharInLine int
+	msg        string
+	EOF        bool
+}
+
+// Error makes ScanError comply with the Error interface. It returns
+// a string representation of the ScanError including its message and
+// some human readable position information.
+func (se ScanError) Error() string {
+	return fmt.Sprintf("Error:%d,%d: %s", se.Line, se.CharInLine, se.msg)
+}
diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go
index e76cdec..f27e420 100644
--- a/rule/sexpr/lexer_internal_test.go
+++ b/rule/sexpr/lexer_internal_test.go
@@ -1,6 +1,9 @@
 package sexpr
 
 import (
+	"bytes"
+	"fmt"
+	"io"
 	"testing"
 
 	"github.com/stretchr/testify/require"
@@ -144,5 +147,206 @@ func TestIsSymbol(t *testing.T) {
 	require.False(t, isSymbol(')'))
 	require.False(t, isSymbol('('))
 	require.False(t, isSymbol('0'))
+}
+
+// NewScanner wraps an io.Reader
+func TestNewScanner(t *testing.T) {
+	expected := "(+ 1 1)"
+	b := bytes.NewBufferString(expected)
+	s := NewScanner(b)
+	content, err := s.r.ReadString('\n')
+	require.Error(t, err)
+	require.Equal(t, io.EOF, err)
+	require.Equal(t, expected, content)
+}
+
+func assertScannerScanned(t *testing.T, s *Scanner, output string, token Token, byteCount, charCount, lineCount, lineCharCount int) {
+	tok, lit, err := s.Scan()
+	require.NoError(t, err)
+	require.Equalf(t, token, tok, "token")
+	require.Equalf(t, output, lit, "literal")
+	require.Equalf(t, byteCount, s.byteCount, "byteCount")
+	require.Equalf(t, charCount, s.charCount, "charCount")
+	require.Equalf(t, lineCount, s.lineCount, "lineCount")
+	require.Equalf(t, lineCharCount, s.lineCharCount, "lineCharCount")
+}
+
+func assertScanned(t *testing.T, input, output string, token Token, byteCount, charCount, lineCount, lineCharCount int) {
+	t.Run(fmt.Sprintf("Scan %s 0x%x", input, input), func(t *testing.T) {
+		b := bytes.NewBufferString(input)
+		s := NewScanner(b)
+		assertScannerScanned(t, s, output, token, byteCount, charCount, lineCount, lineCharCount)
+	})
+}
+
+func assertScannerScanFailed(t *testing.T, s *Scanner, message string) {
+	_, _, err := s.Scan()
+	require.EqualError(t, err, message)
+
+}
+
+func assertScanFailed(t *testing.T, input, message string) {
+	t.Run(fmt.Sprintf("Scan should fail %s 0x%x", input, input), func(t *testing.T) {
+		b := bytes.NewBufferString(input)
+		s := NewScanner(b)
+		assertScannerScanFailed(t, s, message)
+	})
+
+}
+
+func TestScannerScanParenthesis(t *testing.T) {
+	// Test L Parenthesis
+	assertScanned(t, "(", "(", LPAREN, 1, 1, 1, 1)
+	// Test R Parenthesis
+	assertScanned(t, ")", ")", RPAREN, 1, 1, 1, 1)
+}
+
+func TestScannerScanWhiteSpace(t *testing.T) {
+	// Test white-space
+	assertScanned(t, " ", " ", WHITESPACE, 1, 1, 1, 1)
+	assertScanned(t, "\t", "\t", WHITESPACE, 1, 1, 1, 1)
+	assertScanned(t, "\r", "\r", WHITESPACE, 1, 1, 1, 1)
+	assertScanned(t, "\n", "\n", WHITESPACE, 1, 1, 2, 0)
+	assertScanned(t, "\v", "\v", WHITESPACE, 1, 1, 1, 1)
+	assertScanned(t, "\f", "\f", WHITESPACE, 1, 1, 1, 1)
+	// Test contiguous white-space:
+	// - terminated by EOF
+	assertScanned(t, "  ", "  ", WHITESPACE, 2, 2, 1, 2)
+	// - terminated by non white-space character.
+	assertScanned(t, "  (", "  ", WHITESPACE, 2, 2, 1, 2)
+}
+
+func TestScannerScanString(t *testing.T) {
+	// Test string:
+	// - the empty string
+	assertScanned(t, `""`, "", STRING, 2, 2, 1, 2)
+	// - the happy case
+	assertScanned(t, `"foo"`, "foo", STRING, 5, 5, 1, 5)
+	// - an unterminated sad case
+	assertScanFailed(t, `"foo`, "Error:1,4: unterminated string constant")
+	// - happy case with escaped double quote
+	assertScanned(t, `"foo\""`, `foo"`, STRING, 7, 7, 1, 7)
+	// - sad case with escaped terminator
+	assertScanFailed(t, `"foo\"`, "Error:1,6: unterminated string constant")
+}
+
+func TestScannerScanNumber(t *testing.T) {
+	// Test number
+	// - Single digit integer, EOF terminated
+	assertScanned(t, "1", "1", NUMBER, 1, 1, 1, 1)
+	// - Single digit integer, terminated by non-numeric character
+	assertScanned(t, "1)", "1", NUMBER, 1, 1, 1, 1)
+	// - Multi-digit integer, EOF terminated
+	assertScanned(t, "998989", "998989", NUMBER, 6, 6, 1, 6)
+	// - Negative multi-digit integer, EOF terminated
+	assertScanned(t, "-100", "-100", NUMBER, 4, 4, 1, 4)
+	// - Floating point number, EOF terminated
+	assertScanned(t, "2.4", "2.4", NUMBER, 3, 3, 1, 3)
+	// - long negative float, terminated by non-numeric character
+	assertScanned(t, "-123.45456 ", "-123.45456", NUMBER, 10, 10, 1, 10)
+	// - special case: a "-" without a number following it (as per the minus operator)
+	assertScanned(t, "- 1 2", "-", SYMBOL, 1, 1, 1, 1)
+	// - sad case: a minus mid-number
+	assertScanFailed(t, "1-2", "Error:1,2: invalid number format (minus can only appear at the beginning of a number)")
+}
+
+func TestScannerScanBool(t *testing.T) {
+	// Happy cases
+	// - true, EOF Terminated
+	assertScanned(t, "#true", "true", BOOL, 5, 5, 1, 5)
+	// - false, newline terminated
+	assertScanned(t, "#false\n", "false", BOOL, 6, 6, 1, 6)
+	// Sad cases
+	// - partial true
+	assertScanFailed(t, "#tru ", "Error:1,4: invalid boolean: tru")
+	// - partial false
+	assertScanFailed(t, "#fa)", "Error:1,3: invalid boolean: fa")
+	// - invalid
+	assertScanFailed(t, "#1", "Error:1,1: invalid boolean")
+	// - repeated signal character
+	assertScanFailed(t, "##", "Error:1,1: invalid boolean")
+	// - empty
+	assertScanFailed(t, "#", "Error:1,1: invalid boolean")
+}
+
+func TestScannerScanComment(t *testing.T) {
+	// Simple empty comment at EOF
+	assertScanned(t, ";", "", COMMENT, 1, 1, 1, 1)
+	// Comment terminated by newline
+	assertScanned(t, "; Foo\nbar", " Foo", COMMENT, 6, 6, 2, 0)
+	// Comment containing Comment char
+	assertScanned(t, ";Pants;On;Fire", "Pants;On;Fire", COMMENT, 14, 14, 1, 14)
+	// Comment containing control characters
+	assertScanned(t, `;()"-#1`, `()"-#1`, COMMENT, 7, 7, 1, 7)
+}
+
+func TestScannerScanSymbol(t *testing.T) {
+	// Simple, single character identifier
+	assertScanned(t, "a", "a", SYMBOL, 1, 1, 1, 1)
+	// Fully formed symbol
+	assertScanned(t, "abba-sucks-123_ok!", "abba-sucks-123_ok!", SYMBOL, 18, 18, 1, 18)
+	// Unicode in symbols
+	assertScanned(t, "mötlěy_crü_sucks_more", "mötlěy_crü_sucks_more", SYMBOL, 24, 21, 1, 21)
+	// terminated by comment
+	assertScanned(t, "bon;jovi is worse", "bon", SYMBOL, 3, 3, 1, 3)
+	// terminated by whitespace
+	assertScanned(t, "van halen is the worst", "van", SYMBOL, 3, 3, 1, 3)
+	// terminated by control character
+	assertScanned(t, "NoWayMichaelBolton)IsTheNadir", "NoWayMichaelBolton", SYMBOL, 18, 18, 1, 18)
+	// symbol starting with a non-alpha character
+	assertScanned(t, "+", "+", SYMBOL, 1, 1, 1, 1)
+	// actually handled by the number scan, but we'll check '-' all the same:
+	assertScanned(t, "-", "-", SYMBOL, 1, 1, 1, 1)
+}
+
+// Scanner.Scan can scan a full symbolic expression sequence.
+func TestScannerScanSequence(t *testing.T) {
+	input := `
+(and
+  (= (+ 1 -1) 0)
+  (= my-parameter "fudge sundae")) ; Crazy
+`
+	b := bytes.NewBufferString(input)
+	s := NewScanner(b)
+	assertScannerScanned(t, s, "\n", WHITESPACE, 1, 1, 2, 0)
+	assertScannerScanned(t, s, "(", LPAREN, 2, 2, 2, 1)
+	assertScannerScanned(t, s, "and", SYMBOL, 5, 5, 2, 4)
+	assertScannerScanned(t, s, "\n  ", WHITESPACE, 8, 8, 3, 2)
+	assertScannerScanned(t, s, "(", LPAREN, 9, 9, 3, 3)
+	assertScannerScanned(t, s, "=", SYMBOL, 10, 10, 3, 4)
+	assertScannerScanned(t, s, " ", WHITESPACE, 11, 11, 3, 5)
+	assertScannerScanned(t, s, "(", LPAREN, 12, 12, 3, 6)
+	assertScannerScanned(t, s, "+", SYMBOL, 13, 13, 3, 7)
+	assertScannerScanned(t, s, " ", WHITESPACE, 14, 14, 3, 8)
+	assertScannerScanned(t, s, "1", NUMBER, 15, 15, 3, 9)
+	assertScannerScanned(t, s, " ", WHITESPACE, 16, 16, 3, 10)
+	assertScannerScanned(t, s, "-1", NUMBER, 18, 18, 3, 12)
+	assertScannerScanned(t, s, ")", RPAREN, 19, 19, 3, 13)
+	assertScannerScanned(t, s, " ", WHITESPACE, 20, 20, 3, 14)
+	assertScannerScanned(t, s, "0", NUMBER, 21, 21, 3, 15)
+	assertScannerScanned(t, s, ")", RPAREN, 22, 22, 3, 16)
+	assertScannerScanned(t, s, "\n  ", WHITESPACE, 25, 25, 4, 2)
+	assertScannerScanned(t, s, "(", LPAREN, 26, 26, 4, 3)
+	assertScannerScanned(t, s, "=", SYMBOL, 27, 27, 4, 4)
+	assertScannerScanned(t, s, " ", WHITESPACE, 28, 28, 4, 5)
+	assertScannerScanned(t, s, "my-parameter", SYMBOL, 40, 40, 4, 17)
+	assertScannerScanned(t, s, " ", WHITESPACE, 41, 41, 4, 18)
+	assertScannerScanned(t, s, "fudge sundae", STRING, 55, 55, 4, 32)
+	assertScannerScanned(t, s, ")", RPAREN, 56, 56, 4, 33)
+	assertScannerScanned(t, s, ")", RPAREN, 57, 57, 4, 34)
+	assertScannerScanned(t, s, " ", WHITESPACE, 58, 58, 4, 35)
+	assertScannerScanned(t, s, " Crazy", COMMENT, 66, 66, 5, 0)
+	assertScannerScanned(t, s, "", EOF, 66, 66, 5, 0)
+}
 
+func TestScannerScanReturnsScanError(t *testing.T) {
+	input := `
+(= "toffee`
+	b := bytes.NewBufferString(input)
+	s := NewScanner(b)
+	assertScannerScanned(t, s, "\n", WHITESPACE, 1, 1, 2, 0)
+	assertScannerScanned(t, s, "(", LPAREN, 2, 2, 2, 1)
+	assertScannerScanned(t, s, "=", SYMBOL, 3, 3, 2, 2)
+	assertScannerScanned(t, s, " ", WHITESPACE, 4, 4, 2, 3)
+	assertScannerScanFailed(t, s, "Error:2,10: unterminated string constant")
 }