From a424fdcd602d3efe99a03e420514d8e4554d9473 Mon Sep 17 00:00:00 2001 From: "Geoffrey J. Teale" Date: Mon, 5 Nov 2018 16:10:11 +0100 Subject: [PATCH 01/24] Define NewScanner and Scanner type --- rule/sexpr/lexer.go | 19 ++++++++++++++++++- rule/sexpr/lexer_internal_test.go | 12 ++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index de01926..927992d 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -1,6 +1,10 @@ package sexpr -import "unicode" +import ( + "bufio" + "io" + "unicode" +) type Token int @@ -64,3 +68,16 @@ func isComment(r rune) bool { func isSymbol(r rune) bool { return !(isWhitespace(r) || isLParen(r) || isRParen(r) || isString(r) || isNumber(r) || isBool(r) || isComment(r)) } + +// Scanner is a lexical scanner for extracting the lexical tokens from +// a string of characters in our rule symbolic expression language. +type Scanner struct { + r *bufio.Reader +} + +// NewScanner wraps a Scanner around the provided io.Reader so that we +// might scan lexical tokens for the rule symbolic expression language +// from it. 
+func NewScanner(r io.Reader) *Scanner { + return &Scanner{r: bufio.NewReader(r)} +} diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index e76cdec..d441f41 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -1,6 +1,8 @@ package sexpr import ( + "bytes" + "io" "testing" "github.com/stretchr/testify/require" @@ -144,5 +146,15 @@ func TestIsSymbol(t *testing.T) { require.False(t, isSymbol(')')) require.False(t, isSymbol('(')) require.False(t, isSymbol('0')) +} +// NewScanner wraps an io.Reader +func TestNewScanner(t *testing.T) { + expected := "(+ 1 1)" + b := bytes.NewBufferString(expected) + s := NewScanner(b) + content, err := s.r.ReadString('\n') + require.Error(t, err) + require.Equal(t, io.EOF, err) + require.Equal(t, expected, content) } From 382dd5df415fb54c3fbfcd358960283174c1bedc Mon Sep 17 00:00:00 2001 From: "Geoffrey J. Teale" Date: Mon, 5 Nov 2018 16:58:28 +0100 Subject: [PATCH 02/24] Scanner can scan for LPAREN --- rule/sexpr/lexer.go | 61 ++++++++++++++++++++++++++++++- rule/sexpr/lexer_internal_test.go | 11 ++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 927992d..5bb45f6 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -72,7 +72,11 @@ func isSymbol(r rune) bool { // Scanner is a lexical scanner for extracting the lexical tokens from // a string of characters in our rule symbolic expression language. type Scanner struct { - r *bufio.Reader + r *bufio.Reader + byteCount int + charCount int + lineCount int + lineCharCount int } // NewScanner wraps a Scanner around the provided io.Reader so that we @@ -81,3 +85,58 @@ type Scanner struct { func NewScanner(r io.Reader) *Scanner { return &Scanner{r: bufio.NewReader(r)} } + +// Scan returns the next lexical token found in the Scanner's io.Reader. 
+func (s *Scanner) Scan() (Token, string, error) { + rn, err := s.readRune() + if err != nil { + return EOF, "", s.newScanError(err.Error()) + } + switch { + case isLParen(rn): + return LPAREN, "(", nil + } + return EOF, string(rn), s.newScanError("Illegal character scanned") +} + +// +func (s *Scanner) readRune() (rune, error) { + rn, size, err := s.r.ReadRune() + s.byteCount += size + s.charCount++ + s.lineCharCount++ + if rn == '\n' { + // DOS/Windows encoding does \n\r for new lines, but + // we can ignore the \r and still get the right + // result. + s.lineCount++ + // it's char zero, the next readRune should take us to 1 + s.lineCharCount = 0 + } + return rn, err + +} + +// +func (s *Scanner) newScanError(message string) *ScanError { + return &ScanError{ + Byte: s.byteCount, + Char: s.charCount, + Line: s.lineCount + 1, + CharInLine: s.lineCharCount, + msg: message, + } +} + +type ScanError struct { + Byte int + Char int + Line int + CharInLine int + msg string +} + +// +func (se *ScanError) Error() string { + return se.msg +} diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index d441f41..67dbe0a 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -158,3 +158,14 @@ func TestNewScanner(t *testing.T) { require.Equal(t, io.EOF, err) require.Equal(t, expected, content) } + +func TestScannerScan(t *testing.T) { + expected := "(" + b := bytes.NewBufferString(expected) + s := NewScanner(b) + tok, lit, err := s.Scan() + require.NoError(t, err) + require.Equal(t, tok, LPAREN) + require.Equal(t, lit, "(") + +} From 97f092b948f18c3cc4fce8561acc353abcb43915 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 09:05:29 +0100 Subject: [PATCH 03/24] Refactor TestScannerScan --- rule/sexpr/lexer_internal_test.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 
67dbe0a..dc11ec2 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -2,6 +2,7 @@ package sexpr import ( "bytes" + "fmt" "io" "testing" @@ -159,13 +160,17 @@ func TestNewScanner(t *testing.T) { require.Equal(t, expected, content) } -func TestScannerScan(t *testing.T) { - expected := "(" - b := bytes.NewBufferString(expected) - s := NewScanner(b) - tok, lit, err := s.Scan() - require.NoError(t, err) - require.Equal(t, tok, LPAREN) - require.Equal(t, lit, "(") +func assertScanned(t *testing.T, input string, token Token) { + t.Run(fmt.Sprintf("Scan %s", input), func(t *testing.T) { + b := bytes.NewBufferString(input) + s := NewScanner(b) + tok, lit, err := s.Scan() + require.NoError(t, err) + require.Equal(t, token, tok) + require.Equal(t, input, lit) + }) +} +func TestScannerScan(t *testing.T) { + assertScanned(t, "(", LPAREN) } From abd54c264adaa73994f20bdd1726d4dd02e41c98 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 09:10:36 +0100 Subject: [PATCH 04/24] Scan Right Parenthesis --- rule/sexpr/lexer.go | 3 ++- rule/sexpr/lexer_internal_test.go | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 5bb45f6..16abeb7 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -95,11 +95,12 @@ func (s *Scanner) Scan() (Token, string, error) { switch { case isLParen(rn): return LPAREN, "(", nil + case isRParen(rn): + return RPAREN, ")", nil } return EOF, string(rn), s.newScanError("Illegal character scanned") } -// func (s *Scanner) readRune() (rune, error) { rn, size, err := s.r.ReadRune() s.byteCount += size diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index dc11ec2..37e5af7 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -173,4 +173,5 @@ func assertScanned(t *testing.T, input string, token Token) { func TestScannerScan(t *testing.T) { assertScanned(t, "(", LPAREN) + 
assertScanned(t, ")", RPAREN) } From 1308aedca9351281cca187bd738be9618d32ed2c Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 09:19:36 +0100 Subject: [PATCH 05/24] Include line and char-in-line in error message. --- rule/sexpr/lexer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 16abeb7..5f547f7 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -139,5 +139,5 @@ type ScanError struct { // func (se *ScanError) Error() string { - return se.msg + return fmt.Sprintf("Error:%d,%d: %s", se.Line, se.CharInLine, se.msg) } From 722163f661c9d4e50b322b1ecd0a22cb6ac1e5d9 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 09:20:31 +0100 Subject: [PATCH 06/24] Import fmt --- rule/sexpr/lexer.go | 1 + 1 file changed, 1 insertion(+) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 5f547f7..40c70dd 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -2,6 +2,7 @@ package sexpr import ( "bufio" + "fmt" "io" "unicode" ) From 479d9dfbc3e8a5071882c39b23d5a834899cf7ba Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 09:20:49 +0100 Subject: [PATCH 07/24] Initiate lineCount at 1 --- rule/sexpr/lexer.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 40c70dd..001ae3a 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -84,7 +84,10 @@ type Scanner struct { // might scan lexical tokens for the rule symbolic expression language // from it. func NewScanner(r io.Reader) *Scanner { - return &Scanner{r: bufio.NewReader(r)} + return &Scanner{ + r: bufio.NewReader(r), + lineCount: 1, + } } // Scan returns the next lexical token found in the Scanner's io.Reader. 
@@ -124,7 +127,7 @@ func (s *Scanner) newScanError(message string) *ScanError { return &ScanError{ Byte: s.byteCount, Char: s.charCount, - Line: s.lineCount + 1, + Line: s.lineCount, CharInLine: s.lineCharCount, msg: message, } From 144c287691575e54aee4de1b4c8d2b9c71704058 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 09:21:06 +0100 Subject: [PATCH 08/24] Assert that byte, char, line and char-in-line counts are correct --- rule/sexpr/lexer_internal_test.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 37e5af7..7236924 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -160,7 +160,7 @@ func TestNewScanner(t *testing.T) { require.Equal(t, expected, content) } -func assertScanned(t *testing.T, input string, token Token) { +func assertScanned(t *testing.T, input string, token Token, byteCount, charCount, lineCount, lineCharCount int) { t.Run(fmt.Sprintf("Scan %s", input), func(t *testing.T) { b := bytes.NewBufferString(input) s := NewScanner(b) @@ -168,10 +168,15 @@ func assertScanned(t *testing.T, input string, token Token) { require.NoError(t, err) require.Equal(t, token, tok) require.Equal(t, input, lit) + require.Equal(t, byteCount, s.byteCount) + require.Equal(t, charCount, s.charCount) + require.Equal(t, lineCount, s.lineCount) + require.Equal(t, lineCharCount, s.lineCharCount) }) } func TestScannerScan(t *testing.T) { - assertScanned(t, "(", LPAREN) - assertScanned(t, ")", RPAREN) + assertScanned(t, "(", LPAREN, 1, 1, 1, 1) + assertScanned(t, ")", RPAREN, 1, 1, 1, 1) + assertScanned(t, " ", WHITESPACE, 1, 1, 1, 1) } From 60df52673d9a5e91dde2526e4063f9cd6d82f03e Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 14:03:27 +0100 Subject: [PATCH 09/24] Scan single character whitespace --- rule/sexpr/lexer.go | 91 ++++++++++++++++++++++++++++--- 
rule/sexpr/lexer_internal_test.go | 15 +++-- 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 001ae3a..8a5b2b3 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -2,9 +2,11 @@ package sexpr import ( "bufio" + "bytes" "fmt" "io" "unicode" + "unicode/utf8" ) type Token int @@ -73,11 +75,12 @@ func isSymbol(r rune) bool { // Scanner is a lexical scanner for extracting the lexical tokens from // a string of characters in our rule symbolic expression language. type Scanner struct { - r *bufio.Reader - byteCount int - charCount int - lineCount int - lineCharCount int + r *bufio.Reader + byteCount int + charCount int + lineCount int + lineCharCount int + previousLineCharCount int } // NewScanner wraps a Scanner around the provided io.Reader so that we @@ -101,12 +104,25 @@ func (s *Scanner) Scan() (Token, string, error) { return LPAREN, "(", nil case isRParen(rn): return RPAREN, ")", nil + case isWhitespace(rn): + err := s.unreadRune(rn) + if err != nil { + return EOF, string(rn), err + } + return s.scanWhitespace() } return EOF, string(rn), s.newScanError("Illegal character scanned") } func (s *Scanner) readRune() (rune, error) { rn, size, err := s.r.ReadRune() + // EOF is a special case, it shouldn't affect counts + if err == io.EOF { + return rn, s.eof() + } + // We need to update the counts correctly before considering + // any error, so that the data embedded in the ScanError is + // correct. s.byteCount += size s.charCount++ s.lineCharCount++ @@ -115,11 +131,65 @@ func (s *Scanner) readRune() (rune, error) { // we can ignore the \r and still get the right // result. 
s.lineCount++ + // Store the previous line char count in case we unread + s.previousLineCharCount = s.lineCharCount // it's char zero, the next readRune should take us to 1 s.lineCharCount = 0 } - return rn, err + if err != nil { + return rn, s.newScanError(err.Error()) + } + return rn, nil +} + +func (s *Scanner) unreadRune(rn rune) error { + err := s.r.UnreadRune() + if err != nil { + return s.newScanError(err.Error()) + } + // Decrement counts after the unread is complete + s.byteCount -= utf8.RuneLen(rn) + s.charCount-- + s.lineCharCount-- + if rn == '\n' { + s.lineCount-- + s.lineCharCount = s.previousLineCharCount + s.previousLineCharCount-- + } + return nil +} + +// scanWhitespace scans a contiguous sequence of whitespace +// characters. Note that this will consume newlines as it goes, +// lexically speaking they're insignificant to the language. +func (s *Scanner) scanWhitespace() (Token, string, error) { + var b bytes.Buffer + for { + rn, err := s.readRune() + if err != nil { + se := err.(*ScanError) + if se.EOF { + // We'll get EOF next time we try to + // read a rune anyway, so we don't + // have to care about it here, which + // simplifies things. 
+ return WHITESPACE, b.String(), nil + + } + return WHITESPACE, b.String(), err + + } + if !isWhitespace(rn) { + err = s.unreadRune(rn) + if err != nil { + return WHITESPACE, b.String(), err + } + break + } + b.WriteRune(rn) + } + return WHITESPACE, b.String(), nil } // @@ -133,15 +203,22 @@ func (s *Scanner) newScanError(message string) *ScanError { } } +func (s *Scanner) eof() *ScanError { + err := s.newScanError("EOF") + err.EOF = true + return err +} + type ScanError struct { Byte int Char int Line int CharInLine int msg string + EOF bool } // -func (se *ScanError) Error() string { +func (se ScanError) Error() string { return fmt.Sprintf("Error:%d,%d: %s", se.Line, se.CharInLine, se.msg) } diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 7236924..95b7375 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -161,17 +161,17 @@ func TestNewScanner(t *testing.T) { } func assertScanned(t *testing.T, input string, token Token, byteCount, charCount, lineCount, lineCharCount int) { - t.Run(fmt.Sprintf("Scan %s", input), func(t *testing.T) { + t.Run(fmt.Sprintf("Scan %s 0x%x", input, input), func(t *testing.T) { b := bytes.NewBufferString(input) s := NewScanner(b) tok, lit, err := s.Scan() require.NoError(t, err) require.Equal(t, token, tok) require.Equal(t, input, lit) - require.Equal(t, byteCount, s.byteCount) - require.Equal(t, charCount, s.charCount) - require.Equal(t, lineCount, s.lineCount) - require.Equal(t, lineCharCount, s.lineCharCount) + require.Equalf(t, byteCount, s.byteCount, "byteCount") + require.Equalf(t, charCount, s.charCount, "charCount") + require.Equalf(t, lineCount, s.lineCount, "lineCount") + require.Equalf(t, lineCharCount, s.lineCharCount, "lineCharCount") }) } @@ -179,4 +179,9 @@ func TestScannerScan(t *testing.T) { assertScanned(t, "(", LPAREN, 1, 1, 1, 1) assertScanned(t, ")", RPAREN, 1, 1, 1, 1) assertScanned(t, " ", WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "\t", 
WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "\r", WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "\n", WHITESPACE, 1, 1, 2, 0) + assertScanned(t, "\v", WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "\f", WHITESPACE, 1, 1, 1, 1) } From b56cfc4f0f418157e5b1bfee296e3d59da1e7004 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 14:28:21 +0100 Subject: [PATCH 10/24] Test that we can scan contiguous blocks of whitespace --- rule/sexpr/lexer_internal_test.go | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 95b7375..a90017b 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -160,14 +160,14 @@ func TestNewScanner(t *testing.T) { require.Equal(t, expected, content) } -func assertScanned(t *testing.T, input string, token Token, byteCount, charCount, lineCount, lineCharCount int) { +func assertScanned(t *testing.T, input, output string, token Token, byteCount, charCount, lineCount, lineCharCount int) { t.Run(fmt.Sprintf("Scan %s 0x%x", input, input), func(t *testing.T) { b := bytes.NewBufferString(input) s := NewScanner(b) tok, lit, err := s.Scan() require.NoError(t, err) require.Equal(t, token, tok) - require.Equal(t, input, lit) + require.Equal(t, output, lit) require.Equalf(t, byteCount, s.byteCount, "byteCount") require.Equalf(t, charCount, s.charCount, "charCount") require.Equalf(t, lineCount, s.lineCount, "lineCount") @@ -176,12 +176,19 @@ func assertScanned(t *testing.T, input string, token Token, byteCount, charCount } func TestScannerScan(t *testing.T) { - assertScanned(t, "(", LPAREN, 1, 1, 1, 1) - assertScanned(t, ")", RPAREN, 1, 1, 1, 1) - assertScanned(t, " ", WHITESPACE, 1, 1, 1, 1) - assertScanned(t, "\t", WHITESPACE, 1, 1, 1, 1) - assertScanned(t, "\r", WHITESPACE, 1, 1, 1, 1) - assertScanned(t, "\n", WHITESPACE, 1, 1, 2, 0) - assertScanned(t, "\v", WHITESPACE, 1, 1, 1, 1) - 
assertScanned(t, "\f", WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "(", "(", LPAREN, 1, 1, 1, 1) + assertScanned(t, ")", ")", RPAREN, 1, 1, 1, 1) + assertScanned(t, " ", " ", WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "\t", "\t", WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "\r", "\r", WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "\n", "\n", WHITESPACE, 1, 1, 2, 0) + assertScanned(t, "\v", "\v", WHITESPACE, 1, 1, 1, 1) + assertScanned(t, "\f", "\f", WHITESPACE, 1, 1, 1, 1) +} + +func TestScannerScanContiguousWhitespace(t *testing.T) { + // Terminated by EOF + assertScanned(t, " ", " ", WHITESPACE, 2, 2, 1, 2) + // Terminated by non-whitespace char + assertScanned(t, " (", " ", WHITESPACE, 2, 2, 1, 2) } From c375c8f387d22ef5cd5cc69fe50e41fce90faec3 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 16:14:03 +0100 Subject: [PATCH 11/24] Assert that we can scan delimited strings, including escaped quotes. --- rule/sexpr/lexer.go | 35 +++++++++++++++++++++++++++++++ rule/sexpr/lexer_internal_test.go | 32 +++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 8a5b2b3..97c5065 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -110,7 +110,10 @@ func (s *Scanner) Scan() (Token, string, error) { return EOF, string(rn), err } return s.scanWhitespace() + case isString(rn): + return s.scanString() } + return EOF, string(rn), s.newScanError("Illegal character scanned") } @@ -192,6 +195,38 @@ func (s *Scanner) scanWhitespace() (Token, string, error) { return WHITESPACE, b.String(), nil } +// scanString returns the contents of single, contiguous, double-quote delimited string constant. +func (s *Scanner) scanString() (Token, string, error) { + var b bytes.Buffer + escape := false + for { + rn, err := s.readRune() + if err != nil { + se := err.(*ScanError) + if se.EOF { + // we reached the end of the file + // without seeing a terminator, that's + // an error. 
+ return STRING, b.String(), s.newScanError("unterminated string constant") + } + return STRING, b.String(), err + } + if escape { + b.WriteRune(rn) + escape = false + continue + } + if isString(rn) { + break + } + escape = rn == '\\' + if !escape { + b.WriteRune(rn) + } + } + return STRING, b.String(), nil +} + // func (s *Scanner) newScanError(message string) *ScanError { return &ScanError{ diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index a90017b..db41a5c 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -175,20 +175,42 @@ func assertScanned(t *testing.T, input, output string, token Token, byteCount, c }) } +func assertScanFailed(t *testing.T, input, message string) { + t.Run(fmt.Sprintf("Scan should fail %s 0x%x", input, input), func(t *testing.T) { + b := bytes.NewBufferString(input) + s := NewScanner(b) + _, _, err := s.Scan() + require.EqualError(t, err, message) + }) + +} + func TestScannerScan(t *testing.T) { + // Test L Parenthesis assertScanned(t, "(", "(", LPAREN, 1, 1, 1, 1) + // Test R Parenthesis assertScanned(t, ")", ")", RPAREN, 1, 1, 1, 1) + // Test white-space assertScanned(t, " ", " ", WHITESPACE, 1, 1, 1, 1) assertScanned(t, "\t", "\t", WHITESPACE, 1, 1, 1, 1) assertScanned(t, "\r", "\r", WHITESPACE, 1, 1, 1, 1) assertScanned(t, "\n", "\n", WHITESPACE, 1, 1, 2, 0) assertScanned(t, "\v", "\v", WHITESPACE, 1, 1, 1, 1) assertScanned(t, "\f", "\f", WHITESPACE, 1, 1, 1, 1) -} - -func TestScannerScanContiguousWhitespace(t *testing.T) { - // Terminated by EOF + // Test contiguous white-space: + // - terminated by EOF assertScanned(t, " ", " ", WHITESPACE, 2, 2, 1, 2) - // Terminated by non-whitespace char + // - terminated by non white-space character. 
assertScanned(t, " (", " ", WHITESPACE, 2, 2, 1, 2) + // Test string: + // - the empty string + assertScanned(t, `""`, "", STRING, 2, 2, 1, 2) + // - the happy case + assertScanned(t, `"foo"`, "foo", STRING, 5, 5, 1, 5) + // - an unterminated sad case + assertScanFailed(t, `"foo`, "Error:1,4: unterminated string constant") + // - happy case with escaped double quote + assertScanned(t, `"foo\""`, `foo"`, STRING, 7, 7, 1, 7) + // - sad case with escaped terminator + assertScanFailed(t, `"foo\"`, "Error:1,6: unterminated string constant") } From 9ba83879c8c1e4fdc97b77a22f5015a1e7042b9c Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 16:47:32 +0100 Subject: [PATCH 12/24] Assert scanner scans single digit integers --- rule/sexpr/lexer.go | 32 +++++++++++++++++++++++++++++++ rule/sexpr/lexer_internal_test.go | 5 +++++ 2 files changed, 37 insertions(+) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 97c5065..35b3314 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -112,6 +112,12 @@ func (s *Scanner) Scan() (Token, string, error) { return s.scanWhitespace() case isString(rn): return s.scanString() + case isNumber(rn): + err := s.unreadRune(rn) + if err != nil { + return EOF, string(rn), err + } + return s.scanNumber() } return EOF, string(rn), s.newScanError("Illegal character scanned") @@ -227,6 +233,32 @@ func (s *Scanner) scanString() (Token, string, error) { return STRING, b.String(), nil } +// +func (s *Scanner) scanNumber() (Token, string, error) { + var b bytes.Buffer + + for { + rn, err := s.readRune() + if err != nil { + se := err.(*ScanError) + if se.EOF { + // EOF is a valid terminator for a number + return NUMBER, b.String(), nil + } + return NUMBER, b.String(), err + } + if !isNumber(rn) { + err := s.unreadRune(rn) + if err != nil { + return NUMBER, b.String(), err + } + break + } + b.WriteRune(rn) + } + return NUMBER, b.String(), nil +} + // func (s *Scanner) newScanError(message string) *ScanError { return 
&ScanError{ diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index db41a5c..a0a8bca 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -213,4 +213,9 @@ func TestScannerScan(t *testing.T) { assertScanned(t, `"foo\""`, `foo"`, STRING, 7, 7, 1, 7) // - sad case with escaped terminator assertScanFailed(t, `"foo\"`, "Error:1,6: unterminated string constant") + // Test number + // - Single digit integer, EOF terminated + assertScanned(t, "1", "1", NUMBER, 1, 1, 1, 1) + // - Single digit integer, terminated by non-number character + assertScanned(t, "1)", "1", NUMBER, 1, 1, 1, 1) } From 71861975ad19f7f846f26c545292759e4b0b188a Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 16:49:11 +0100 Subject: [PATCH 13/24] Assert scanner scans multidigit integers --- rule/sexpr/lexer_internal_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index a0a8bca..38ce6cd 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -218,4 +218,6 @@ func TestScannerScan(t *testing.T) { assertScanned(t, "1", "1", NUMBER, 1, 1, 1, 1) // - Single digit integer, terminated by non-number character assertScanned(t, "1)", "1", NUMBER, 1, 1, 1, 1) + // - Multi digit integer, EOF terminated + assertScanned(t, "998989", "998989", NUMBER, 6, 6, 1, 6) } From 33e3b9f2b890cf7d40e144246928a1b5a2dd233c Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Wed, 7 Nov 2018 16:50:30 +0100 Subject: [PATCH 14/24] Assert scanner scans negative integers --- rule/sexpr/lexer_internal_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 38ce6cd..a508ece 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -218,6 +218,8 @@ func TestScannerScan(t *testing.T) { assertScanned(t, 
"1", "1", NUMBER, 1, 1, 1, 1) // - Single digit integer, terminated by non-number character assertScanned(t, "1)", "1", NUMBER, 1, 1, 1, 1) - // - Multi digit integer, EOF terminated + // - Multi-digit integer, EOF terminated assertScanned(t, "998989", "998989", NUMBER, 6, 6, 1, 6) + // - Negative multi-digit integer, EOF terminated + assertScanned(t, "-100", "-100", NUMBER, 4, 4, 1, 4) } From 33a9aa99e3157747244375180e529060a4ca485a Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Thu, 8 Nov 2018 11:35:52 +0100 Subject: [PATCH 15/24] Support full numeric scanning including '-' differentiation. --- rule/sexpr/lexer.go | 115 +++++++++++++++++++++++------- rule/sexpr/lexer_internal_test.go | 11 ++- 2 files changed, 99 insertions(+), 27 deletions(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 35b3314..bd2a1e3 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -9,6 +9,8 @@ import ( "unicode/utf8" ) +// Tokens are the fundamental identifier of the lexical scanner. +// Every scanned element will be assigned a token type. type Token int const ( @@ -51,7 +53,7 @@ func isNumber(r rune) bool { // Note, although we allow a number to contain a decimal // point, it can't start with one so we don't include that in // the predicate. - return r == '-' || (r >= '0' && r <= '9') + return r == '-' || unicode.IsDigit(r) } // isBool returns true if the rune is the # (hash or octothorpe) @@ -105,24 +107,19 @@ func (s *Scanner) Scan() (Token, string, error) { case isRParen(rn): return RPAREN, ")", nil case isWhitespace(rn): - err := s.unreadRune(rn) - if err != nil { - return EOF, string(rn), err - } + s.unreadRune(rn) return s.scanWhitespace() case isString(rn): return s.scanString() case isNumber(rn): - err := s.unreadRune(rn) - if err != nil { - return EOF, string(rn), err - } + s.unreadRune(rn) return s.scanNumber() } return EOF, string(rn), s.newScanError("Illegal character scanned") } +// readRune pulls the next rune from the input sequence. 
func (s *Scanner) readRune() (rune, error) { rn, size, err := s.r.ReadRune() // EOF is a special case, it shouldn't affect counts @@ -151,10 +148,14 @@ func (s *Scanner) readRune() (rune, error) { return rn, nil } -func (s *Scanner) unreadRune(rn rune) error { +// unreadRune puts the last readRune back on the buffer and resets the +// counters. It requires that the rune to be unread is passed, as we +// need to know the byte size of the rune. +func (s *Scanner) unreadRune(rn rune) { err := s.r.UnreadRune() if err != nil { - return s.newScanError(err.Error()) + // This means something truly awful happened! + panic(err.Error()) } // Decrement counts after the unread is complete s.byteCount -= utf8.RuneLen(rn) @@ -166,7 +167,6 @@ func (s *Scanner) unreadRune(rn rune) error { s.previousLineCharCount-- } - return nil } // scanWhitespace scans a contiguous sequence of whitespace @@ -190,10 +190,7 @@ func (s *Scanner) scanWhitespace() (Token, string, error) { } if !isWhitespace(rn) { - err = s.unreadRune(rn) - if err != nil { - return WHITESPACE, b.String(), err - } + s.unreadRune(rn) break } b.WriteRune(rn) @@ -233,10 +230,62 @@ func (s *Scanner) scanString() (Token, string, error) { return STRING, b.String(), nil } -// +// scanNumber scans a contiguous string representing a number. As we +// have to handle the negative numeric form, it's possible that the +// '-' rune can prefix a number. This is problematic because '-' can +// also be a symbol referring to the arithmetic operation "minus" - +// that confusion is resolved by scanNumber, and should it consider +// the latter case to be true it will return a SYMBOL rather than a +// NUMBER. func (s *Scanner) scanNumber() (Token, string, error) { var b bytes.Buffer + // We can be certain this isn't EOF because we will already + // have read and unread the rune before arriving here. + rn, err := s.readRune() + if err != nil { + // Something drastic happened, because we read this fine the first time. 
+ return NUMBER, "", err + } + + // Whatever happens we'll want the rune. + b.WriteRune(rn) + + // Deal with the first rune. Numbers have special rules about + // the first rune, specifically it, and only it, may be the + // minus symbol. When we loop later any occurrence of '-' will + // be an error. + if rn == '-' { + + // Now we look ahead to see if a number is coming, if + // its anything else then this isn't a negative + // number, but some other form. + rn, err := s.readRune() + + // EOF would leave us with a '-' on its own. + // This is never valid, so we can just promote + // the error without bothering to check. + if err != nil { + return NUMBER, b.String(), err + } + + // We've stored the rune, and we know we'll want to + // unread whatever happens, so lets just do that now. + s.unreadRune(rn) + + // If the next rune isn't a digit then we're going to + // assume this is the minus operator and return '-' as + // a symbol instead of a number. There are still + // cases where this wouldn't be valid, but they're all + // errors and we'll leave that for the Parser to + // handle. + if !unicode.IsDigit(rn) { + return SYMBOL, b.String(), nil + } + } + + // OK, let's scan the rest of the number... + for { rn, err := s.readRune() if err != nil { @@ -247,19 +296,26 @@ func (s *Scanner) scanNumber() (Token, string, error) { } return NUMBER, b.String(), err } - if !isNumber(rn) { - err := s.unreadRune(rn) - if err != nil { - return NUMBER, b.String(), err - } - break + if rn == '-' { + // As we said before '-' can't appear in the + // body of a number, this is an error. + return NUMBER, b.String(), s.newScanError("invalid number format (minus can only appear at the beginning of a number)") } - b.WriteRune(rn) + + // Valid number parts are written to the buffer + if isNumber(rn) || rn == '.' { + b.WriteRune(rn) + continue + } + // we hit a terminating character, end the number here. 
+ s.unreadRune(rn) + break } return NUMBER, b.String(), nil } -// +// newScanError returns a ScanError initialised with the current +// positional information of the Scanner. func (s *Scanner) newScanError(message string) *ScanError { return &ScanError{ Byte: s.byteCount, @@ -270,12 +326,17 @@ func (s *Scanner) newScanError(message string) *ScanError { } } +// eof returns a ScanError, initialised with the current positional +// information of the Scanner, and with it's EOF field set to True. func (s *Scanner) eof() *ScanError { err := s.newScanError("EOF") err.EOF = true return err } +// ScanError is a type that implements the Error interface, but adds +// additional context information to errors that can be inspected. It +// is intended to be used for all errors emerging from the Scanner. type ScanError struct { Byte int Char int @@ -285,7 +346,9 @@ type ScanError struct { EOF bool } -// +// Error makes ScanError comply with the Error interface. It returns +// a string representation of the ScanError including it's message and +// some human readable position information. 
func (se ScanError) Error() string { return fmt.Sprintf("Error:%d,%d: %s", se.Line, se.CharInLine, se.msg) } diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index a508ece..475b0ff 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -216,10 +216,19 @@ func TestScannerScan(t *testing.T) { // Test number // - Single digit integer, EOF terminated assertScanned(t, "1", "1", NUMBER, 1, 1, 1, 1) - // - Single digit integer, terminated by non-number character + // - Single digit integer, terminated by non-numeric character assertScanned(t, "1)", "1", NUMBER, 1, 1, 1, 1) // - Multi-digit integer, EOF terminated assertScanned(t, "998989", "998989", NUMBER, 6, 6, 1, 6) // - Negative multi-digit integer, EOF terminated assertScanned(t, "-100", "-100", NUMBER, 4, 4, 1, 4) + // - Floating point number, EOF terminated + assertScanned(t, "2.4", "2.4", NUMBER, 3, 3, 1, 3) + // - long negative float, terminated by non-numeric character + assertScanned(t, "-123.45456 ", "-123.45456", NUMBER, 10, 10, 1, 10) + // - special case: a "-" without a number following it (as per the minus operator) + assertScanned(t, "- 1 2", "-", SYMBOL, 1, 1, 1, 1) + // - sad case: a minus mid-number + assertScanFailed(t, "1-2", "Error:1,2: invalid number format (minus can only appear at the beginning of a number)") + assertScanFailed(t, "-", "Error:1,1: EOF") } From 4f92dfbb77fc1f088afcfc8fe98ef04804b2add7 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Fri, 9 Nov 2018 15:16:22 +0100 Subject: [PATCH 16/24] Assert that we can scan Booleans --- rule/sexpr/lexer.go | 39 +++++++++++++++++++++++++++++++ rule/sexpr/lexer_internal_test.go | 22 ++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index bd2a1e3..4b69b3f 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -114,6 +114,8 @@ func (s *Scanner) Scan() (Token, string, error) { case isNumber(rn): 
 s.unreadRune(rn) return s.scanNumber() + case isBool(rn): + return s.scanBool() } return EOF, string(rn), s.newScanError("Illegal character scanned") @@ -314,6 +316,43 @@ func (s *Scanner) scanNumber() (Token, string, error) { return NUMBER, b.String(), nil } +// scanBool scans the contiguous characters following the '#' symbol, +// if they are either 'true' or 'false' a BOOL is returned, otherwise +// a ScanError will be returned. +func (s *Scanner) scanBool() (Token, string, error) { + + var b bytes.Buffer + + for { + rn, err := s.readRune() + if err != nil { + se := err.(*ScanError) + if se.EOF { + // EOF is a valid terminator for a number + break + } + return BOOL, b.String(), err + } + + // isSymbol is handy shorthand for "it's not anything else" + if !isSymbol(rn) { + s.unreadRune(rn) + break + } + b.WriteRune(rn) + } + + symbol := b.String() + if symbol == "true" || symbol == "false" { + return BOOL, symbol, nil + } + if len(symbol) > 0 { + return BOOL, symbol, s.newScanError(fmt.Sprintf("invalid boolean: %s", symbol)) + } + return BOOL, symbol, s.newScanError("invalid boolean") + +} + // newScanError returns a ScanError initialised with the current // positional information of the Scanner. 
func (s *Scanner) newScanError(message string) *ScanError { diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 475b0ff..596c1f1 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -185,11 +185,14 @@ func assertScanFailed(t *testing.T, input, message string) { } -func TestScannerScan(t *testing.T) { +func TestScannerScanParenthesis(t *testing.T) { // Test L Parenthesis assertScanned(t, "(", "(", LPAREN, 1, 1, 1, 1) // Test R Parenthesis assertScanned(t, ")", ")", RPAREN, 1, 1, 1, 1) +} + +func TestScannerScanWhiteSpace(t *testing.T) { // Test white-space assertScanned(t, " ", " ", WHITESPACE, 1, 1, 1, 1) assertScanned(t, "\t", "\t", WHITESPACE, 1, 1, 1, 1) @@ -202,6 +205,9 @@ func TestScannerScan(t *testing.T) { assertScanned(t, " ", " ", WHITESPACE, 2, 2, 1, 2) // - terminated by non white-space character. assertScanned(t, " (", " ", WHITESPACE, 2, 2, 1, 2) +} + +func TestScannerScanString(t *testing.T) { // Test string: // - the empty string assertScanned(t, `""`, "", STRING, 2, 2, 1, 2) @@ -213,6 +219,9 @@ func TestScannerScan(t *testing.T) { assertScanned(t, `"foo\""`, `foo"`, STRING, 7, 7, 1, 7) // - sad case with escaped terminator assertScanFailed(t, `"foo\"`, "Error:1,6: unterminated string constant") +} + +func TestScannerScanNumber(t *testing.T) { // Test number // - Single digit integer, EOF terminated assertScanned(t, "1", "1", NUMBER, 1, 1, 1, 1) @@ -230,5 +239,16 @@ func TestScannerScan(t *testing.T) { assertScanned(t, "- 1 2", "-", SYMBOL, 1, 1, 1, 1) // - sad case: a minus mid-number assertScanFailed(t, "1-2", "Error:1,2: invalid number format (minus can only appear at the beginning of a number)") + // - sad case: a minus followed by EOF assertScanFailed(t, "-", "Error:1,1: EOF") } + +func TestScannerScanBool(t *testing.T) { + assertScanned(t, "#true", "true", BOOL, 5, 5, 1, 5) + assertScanned(t, "#false", "false", BOOL, 6, 6, 1, 6) + assertScanFailed(t, "#tru ", "Error:1,4: 
invalid boolean: tru") + assertScanFailed(t, "#fa)", "Error:1,3: invalid boolean: fa") + assertScanFailed(t, "#1", "Error:1,1: invalid boolean") + assertScanFailed(t, "##", "Error:1,1: invalid boolean") + assertScanFailed(t, "#", "Error:1,1: invalid boolean") +} From 77d61a14a1404ae678050ea0456edfa663aaf276 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Fri, 9 Nov 2018 16:38:26 +0100 Subject: [PATCH 17/24] Support scanning comments. --- rule/sexpr/lexer.go | 25 +++++++++++++++++++++++++ rule/sexpr/lexer_internal_test.go | 22 +++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index 4b69b3f..bd5ce75 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -116,6 +116,8 @@ func (s *Scanner) Scan() (Token, string, error) { return s.scanNumber() case isBool(rn): return s.scanBool() + case isComment(rn): + return s.scanComment() } return EOF, string(rn), s.newScanError("Illegal character scanned") @@ -350,7 +352,30 @@ func (s *Scanner) scanBool() (Token, string, error) { return BOOL, symbol, s.newScanError(fmt.Sprintf("invalid boolean: %s", symbol)) } return BOOL, symbol, s.newScanError("invalid boolean") +} + +// scanComment will scan to the end of the current line, consuming any and all chars prior to '\n'. 
+func (s *Scanner) scanComment() (Token, string, error) { + var b bytes.Buffer + for { + rn, err := s.readRune() + if err != nil { + se := err.(*ScanError) + if se.EOF { + // EOF is a valid terminator for a Comment + break + } + return COMMENT, b.String(), err + } + + if rn == '\n' { + break + } + + b.WriteRune(rn) + } + return COMMENT, b.String(), nil } // newScanError returns a ScanError initialised with the current diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 596c1f1..c255b56 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -244,11 +244,31 @@ func TestScannerScanNumber(t *testing.T) { } func TestScannerScanBool(t *testing.T) { + // Happy cases + // - true, EOF Terminated assertScanned(t, "#true", "true", BOOL, 5, 5, 1, 5) - assertScanned(t, "#false", "false", BOOL, 6, 6, 1, 6) + // - false, newline terminated + assertScanned(t, "#false\n", "false", BOOL, 7, 7, 2, 0) + // Sad cases + // - partial true assertScanFailed(t, "#tru ", "Error:1,4: invalid boolean: tru") + // - partial false assertScanFailed(t, "#fa)", "Error:1,3: invalid boolean: fa") + // - invalid assertScanFailed(t, "#1", "Error:1,1: invalid boolean") + // - repeated signal character assertScanFailed(t, "##", "Error:1,1: invalid boolean") + // - empty assertScanFailed(t, "#", "Error:1,1: invalid boolean") } + +func TestScannerScanComment(t *testing.T) { + // Simple empty comment at EOF + assertScanned(t, ";", "", COMMENT, 1, 1, 1, 1) + // Comment terminated by newline + assertScanned(t, "; Foo\nbar", " Foo", COMMENT, 6, 6, 2, 0) + // Comment containing Comment char + assertScanned(t, ";Pants;On;Fire", "Pants;On;Fire", COMMENT, 14, 14, 1, 14) + // Comment containing control characters + assertScanned(t, `;()"-#1`, `()"-#1`, COMMENT, 7, 7, 1, 7) +} From 77a65f5b06d8375615e223a423e3abcf13059c65 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Fri, 9 Nov 2018 17:17:55 +0100 Subject: [PATCH 18/24] We can scan 
symbols --- rule/sexpr/lexer.go | 41 ++++++++++++++++++++++++++++--- rule/sexpr/lexer_internal_test.go | 19 ++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index bd5ce75..a6550bd 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -118,6 +118,9 @@ func (s *Scanner) Scan() (Token, string, error) { return s.scanBool() case isComment(rn): return s.scanComment() + case isSymbol(rn): + s.unreadRune(rn) + return s.scanSymbol() } return EOF, string(rn), s.newScanError("Illegal character scanned") @@ -266,10 +269,16 @@ func (s *Scanner) scanNumber() (Token, string, error) { // number, but some other form. rn, err := s.readRune() - // EOF would leave us with a '-' on its own. - // This is never valid, so we can just promote - // the error without bothering to check. if err != nil { + se := err.(*ScanError) + if se.EOF { + // In reality having '-' as the final + // symbol in a stream is never useful, + // but this is the sort of error we + // should catch in the Parser, not the + // scanner. + return SYMBOL, b.String(), nil + } return NUMBER, b.String(), err } @@ -378,6 +387,32 @@ func (s *Scanner) scanComment() (Token, string, error) { return COMMENT, b.String(), nil } +// scanSymbol scans a contiguous block of symbol characters. Any non-symbol character will terminate it. +func (s *Scanner) scanSymbol() (Token, string, error) { + var b bytes.Buffer + + for { + rn, err := s.readRune() + if err != nil { + se := err.(*ScanError) + if se.EOF { + // EOF is a valid terminator for a Comment + break + } + return SYMBOL, b.String(), err + } + // Again, we have to special case '-', which can't start a symbol, but can appear in it. + // Likewise numbers. 
+ if !(isSymbol(rn) || rn == '-' || unicode.IsDigit(rn)) { + s.unreadRune(rn) + break + } + b.WriteRune(rn) + } + + return SYMBOL, b.String(), nil +} + // newScanError returns a ScanError initialised with the current // positional information of the Scanner. func (s *Scanner) newScanError(message string) *ScanError { diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index c255b56..8841177 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -272,3 +272,22 @@ func TestScannerScanComment(t *testing.T) { // Comment containing control characters assertScanned(t, `;()"-#1`, `()"-#1`, COMMENT, 7, 7, 1, 7) } + +func TestScannerScanSymbol(t *testing.T) { + // Simple, single character identifier + assertScanned(t, "a", "a", SYMBOL, 1, 1, 1, 1) + // Fully formed symbol + assertScanned(t, "abba-sucks-123_ok!", "abba-sucks-123_ok!", SYMBOL, 18, 18, 1, 18) + // Unicode in symbols + assertScanned(t, "mötlěy_crü_sucks_more", "mötlěy_crü_sucks_more", SYMBOL, 24, 21, 1, 21) + // terminated by comment + assertScanned(t, "bon;jovi is worse", "bon", SYMBOL, 3, 3, 1, 3) + // terminated by whitespace + assertScanned(t, "van halen is the worst", "van", SYMBOL, 3, 3, 1, 3) + // terminated by control character + assertScanned(t, "NoWayMichaelBolton)IsTheNadir", "NoWayMichaelBolton", SYMBOL, 18, 18, 1, 18) + // symbol starting with a non-alpha character + assertScanned(t, "+", "+", SYMBOL, 1, 1, 1, 1) + // actually handled by the number scan, but we'll check '-' all the same: + assertScanned(t, "-", "-", SYMBOL, 1, 1, 1, 1) +} From 11633c8d77abbc88d140d1b0bd35bc9ad39a606d Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Fri, 9 Nov 2018 18:21:55 +0100 Subject: [PATCH 19/24] Assert we scan a full stream correctly --- rule/sexpr/lexer.go | 6 ++- rule/sexpr/lexer_internal_test.go | 68 ++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/rule/sexpr/lexer.go 
b/rule/sexpr/lexer.go index a6550bd..ba5528a 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -99,7 +99,11 @@ func NewScanner(r io.Reader) *Scanner { func (s *Scanner) Scan() (Token, string, error) { rn, err := s.readRune() if err != nil { - return EOF, "", s.newScanError(err.Error()) + se := err.(*ScanError) + if se.EOF { + return EOF, "", nil + } + return EOF, "", err } switch { case isLParen(rn): diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 8841177..7e346e6 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -160,27 +160,36 @@ func TestNewScanner(t *testing.T) { require.Equal(t, expected, content) } +func assertScannerScanned(t *testing.T, s *Scanner, output string, token Token, byteCount, charCount, lineCount, lineCharCount int) { + tok, lit, err := s.Scan() + require.NoError(t, err) + require.Equalf(t, token, tok, "token") + require.Equalf(t, output, lit, "literal") + require.Equalf(t, byteCount, s.byteCount, "byteCount") + require.Equalf(t, charCount, s.charCount, "charCount") + require.Equalf(t, lineCount, s.lineCount, "lineCount") + require.Equalf(t, lineCharCount, s.lineCharCount, "lineCharCount") +} + func assertScanned(t *testing.T, input, output string, token Token, byteCount, charCount, lineCount, lineCharCount int) { t.Run(fmt.Sprintf("Scan %s 0x%x", input, input), func(t *testing.T) { b := bytes.NewBufferString(input) s := NewScanner(b) - tok, lit, err := s.Scan() - require.NoError(t, err) - require.Equal(t, token, tok) - require.Equal(t, output, lit) - require.Equalf(t, byteCount, s.byteCount, "byteCount") - require.Equalf(t, charCount, s.charCount, "charCount") - require.Equalf(t, lineCount, s.lineCount, "lineCount") - require.Equalf(t, lineCharCount, s.lineCharCount, "lineCharCount") + assertScannerScanned(t, s, output, token, byteCount, charCount, lineCount, lineCharCount) }) } +func assertScannerScanFailed(t *testing.T, s *Scanner, message string) { + 
_, _, err := s.Scan() + require.EqualError(t, err, message) + +} + func assertScanFailed(t *testing.T, input, message string) { t.Run(fmt.Sprintf("Scan should fail %s 0x%x", input, input), func(t *testing.T) { b := bytes.NewBufferString(input) s := NewScanner(b) - _, _, err := s.Scan() - require.EqualError(t, err, message) + assertScannerScanFailed(t, s, message) }) } @@ -291,3 +300,42 @@ func TestScannerScanSymbol(t *testing.T) { // actually handled by the number scan, but we'll check '-' all the same: assertScanned(t, "-", "-", SYMBOL, 1, 1, 1, 1) } + +func TestScannerScanSequence(t *testing.T) { + input := ` +(and + (= (+ 1 -1) 0) + (= my-parameter "fudge sundae")) ; Crazy +` + b := bytes.NewBufferString(input) + s := NewScanner(b) + assertScannerScanned(t, s, "\n", WHITESPACE, 1, 1, 2, 0) + assertScannerScanned(t, s, "(", LPAREN, 2, 2, 2, 1) + assertScannerScanned(t, s, "and", SYMBOL, 5, 5, 2, 5) + assertScannerScanned(t, s, "\n ", WHITESPACE, 8, 8, 3, 2) + assertScannerScanned(t, s, "(", LPAREN, 9, 9, 3, 3) + assertScannerScanned(t, s, "=", SYMBOL, 10, 10, 3, 4) + assertScannerScanned(t, s, " ", WHITESPACE, 11, 11, 3, 5) + assertScannerScanned(t, s, "(", LPAREN, 12, 12, 3, 6) + assertScannerScanned(t, s, "+", SYMBOL, 13, 13, 3, 7) + assertScannerScanned(t, s, " ", WHITESPACE, 14, 14, 3, 8) + assertScannerScanned(t, s, "1", NUMBER, 15, 15, 3, 9) + assertScannerScanned(t, s, " ", WHITESPACE, 16, 16, 3, 10) + assertScannerScanned(t, s, "-1", NUMBER, 18, 18, 3, 12) + assertScannerScanned(t, s, ")", RPAREN, 19, 19, 3, 13) + assertScannerScanned(t, s, " ", WHITESPACE, 20, 20, 3, 14) + assertScannerScanned(t, s, "0", NUMBER, 21, 21, 3, 15) + assertScannerScanned(t, s, ")", RPAREN, 22, 22, 3, 16) + assertScannerScanned(t, s, "\n ", WHITESPACE, 25, 25, 4, 2) + assertScannerScanned(t, s, "(", LPAREN, 26, 26, 4, 3) + assertScannerScanned(t, s, "=", SYMBOL, 27, 27, 4, 4) + assertScannerScanned(t, s, " ", WHITESPACE, 28, 28, 4, 5) + assertScannerScanned(t, s, 
"my-parameter", SYMBOL, 40, 40, 4, 17) + assertScannerScanned(t, s, " ", WHITESPACE, 41, 41, 4, 18) + assertScannerScanned(t, s, "fudge sundae", STRING, 55, 55, 4, 32) + assertScannerScanned(t, s, ")", RPAREN, 56, 56, 4, 33) + assertScannerScanned(t, s, ")", RPAREN, 57, 57, 4, 34) + assertScannerScanned(t, s, " ", WHITESPACE, 58, 58, 4, 35) + assertScannerScanned(t, s, " Crazy", COMMENT, 66, 66, 5, 0) + assertScannerScanned(t, s, "", EOF, 66, 66, 5, 0) +} From 8ce5c6b4c245e65efe02669e4989ac77b19ff8a3 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Fri, 9 Nov 2018 18:26:38 +0100 Subject: [PATCH 20/24] Test scanner returns error in scan immediately --- rule/sexpr/lexer_internal_test.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 7e346e6..41388a3 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -301,6 +301,7 @@ func TestScannerScanSymbol(t *testing.T) { assertScanned(t, "-", "-", SYMBOL, 1, 1, 1, 1) } +// Scanner.Scan can scan a full symbollic expression sequence. 
func TestScannerScanSequence(t *testing.T) { input := ` (and @@ -339,3 +340,15 @@ func TestScannerScanSequence(t *testing.T) { assertScannerScanned(t, s, " Crazy", COMMENT, 66, 66, 5, 0) assertScannerScanned(t, s, "", EOF, 66, 66, 5, 0) } + +func TestScannerScanReturnsScanError(t *testing.T) { + input := ` +(= "toffee` + b := bytes.NewBufferString(input) + s := NewScanner(b) + assertScannerScanned(t, s, "\n", WHITESPACE, 1, 1, 2, 0) + assertScannerScanned(t, s, "(", LPAREN, 2, 2, 2, 1) + assertScannerScanned(t, s, "=", SYMBOL, 3, 3, 2, 2) + assertScannerScanned(t, s, " ", WHITESPACE, 4, 4, 2, 3) + assertScannerScanFailed(t, s, "Error:2,10: unterminated string constant") +} From 5bb5f9ce40c1463c68467be23d65ad6730070403 Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Fri, 9 Nov 2018 18:41:44 +0100 Subject: [PATCH 21/24] Fix broken test case --- rule/sexpr/lexer_internal_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index 41388a3..cfeee64 100644 --- a/rule/sexpr/lexer_internal_test.go +++ b/rule/sexpr/lexer_internal_test.go @@ -257,7 +257,7 @@ func TestScannerScanBool(t *testing.T) { // - true, EOF Terminated assertScanned(t, "#true", "true", BOOL, 5, 5, 1, 5) // - false, newline terminated - assertScanned(t, "#false\n", "false", BOOL, 7, 7, 2, 0) + assertScanned(t, "#false\n", "false", BOOL, 6, 6, 1, 7) // Sad cases // - partial true assertScanFailed(t, "#tru ", "Error:1,4: invalid boolean: tru") From 6bdfc297e4cf7a310754123f0cc143f8c5b69e1a Mon Sep 17 00:00:00 2001 From: Geoffrey John Teale Date: Sat, 10 Nov 2018 06:04:12 +0100 Subject: [PATCH 22/24] Scan nolonger fails on '-' followed by EOF - We'll handle that in the Parser --- rule/sexpr/lexer_internal_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/rule/sexpr/lexer_internal_test.go b/rule/sexpr/lexer_internal_test.go index cfeee64..f27e420 100644 --- a/rule/sexpr/lexer_internal_test.go +++ 
b/rule/sexpr/lexer_internal_test.go @@ -248,8 +248,6 @@ func TestScannerScanNumber(t *testing.T) { assertScanned(t, "- 1 2", "-", SYMBOL, 1, 1, 1, 1) // - sad case: a minus mid-number assertScanFailed(t, "1-2", "Error:1,2: invalid number format (minus can only appear at the beginning of a number)") - // - sad case: a minus followed by EOF - assertScanFailed(t, "-", "Error:1,1: EOF") } func TestScannerScanBool(t *testing.T) { From fbc21033cbe777ac2d46aef75b4e255ff67d3959 Mon Sep 17 00:00:00 2001 From: Asdine El Hrychy Date: Mon, 12 Nov 2018 14:38:32 +0100 Subject: [PATCH 23/24] Correct the term used in a docstring. Co-Authored-By: tealeg --- rule/sexpr/lexer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index ba5528a..cfc6a06 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -400,7 +400,7 @@ func (s *Scanner) scanSymbol() (Token, string, error) { if err != nil { se := err.(*ScanError) if se.EOF { - // EOF is a valid terminator for a Comment + // EOF is a valid terminator for a Symbol break } return SYMBOL, b.String(), err From 31073ad4d05ceacfcf85dd5743868058b28f22a9 Mon Sep 17 00:00:00 2001 From: Yasss Date: Wed, 14 Nov 2018 09:22:18 +0100 Subject: [PATCH 24/24] Update rule/sexpr/lexer.go Co-Authored-By: tealeg --- rule/sexpr/lexer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rule/sexpr/lexer.go b/rule/sexpr/lexer.go index cfc6a06..d830831 100644 --- a/rule/sexpr/lexer.go +++ b/rule/sexpr/lexer.go @@ -343,7 +343,7 @@ func (s *Scanner) scanBool() (Token, string, error) { if err != nil { se := err.(*ScanError) if se.EOF { - // EOF is a valid terminator for a number + // EOF is a valid terminator for a boolean break } return BOOL, b.String(), err