From 856c2ae7768d8fb36cffdb434d7349eae5b888ae Mon Sep 17 00:00:00 2001
From: Federico Ficarelli
Date: Tue, 12 Nov 2019 18:00:58 +0100
Subject: [PATCH] Add remaining golden file tests

---
 cool.tm                                        |  4 +-
 lexer_test.go                                  | 70 +++++++++++--------
 testdata/escapednull.cool.lexer.gold.json      |  7 ++
 .../null_in_string.cl.cool.lexer.gold.json     |  7 ++
 ...followed_by_tokens.cl.cool.lexer.gold.json  | 17 +++++
 5 files changed, 75 insertions(+), 30 deletions(-)
 create mode 100644 testdata/escapednull.cool.lexer.gold.json
 create mode 100644 testdata/null_in_string.cl.cool.lexer.gold.json
 create mode 100644 testdata/null_in_string_followed_by_tokens.cl.cool.lexer.gold.json

diff --git a/cool.tm b/cool.tm
index 230a6f4..528abb3 100644
--- a/cool.tm
+++ b/cool.tm
@@ -74,8 +74,8 @@ invalid_token: /"({strRune}*(\\?\x00){strRune}*)+"/
 # Note: It's unclear from the language spec whether multiple unescaped '\n'
 # should produce a single invalid token or not. No golden files with
 # this case are available but 's19.test.cool' shows that a single '\n'
-# splits the invalid literal in two halves. Leaving the rule commented
-# out and looking for clarifications.
+# splits the invalid literal into two lexable halves. Leaving the rule
+# commented out while looking for clarifications.
 # invalid_token: /"({strRune}*([^\\]?\n){strRune}*)+"/ # <- This needs backtracking!
 
 StringLiteral: /"{strRune}*"/
diff --git a/lexer_test.go b/lexer_test.go
index 3bc4ec5..b68f95d 100644
--- a/lexer_test.go
+++ b/lexer_test.go
@@ -1,13 +1,12 @@
 package cool
 
 import (
-	"bytes"
 	"encoding/json"
 	"github.com/google/go-cmp/cmp"
 	"io/ioutil"
 	"log"
+	"path"
 	"testing"
-	"text/template"
 )
 
 type SourceToken struct {
@@ -19,7 +18,7 @@
 func TestLexerSnippets(t *testing.T) {
 	for _, tt := range testSnippets {
 		t.Run(tt.name, func(t *testing.T) {
-			got := scan(tt.source)
+			got := scanTerminals(tt.source)
 			if diff := cmp.Diff(tt.tokens, got); diff != "" {
 				t.Errorf("lex mismatch (-want +got):\n%s", diff)
 			}
@@ -27,27 +26,18 @@ func TestLexerSnippets(t *testing.T) {
 	}
 }
 
-func TestLexerFiles(t *testing.T) {
-	temp, err := template.New("golden").Parse("{{.}}.lexer.gold.json")
-	if err != nil {
-		panic(err)
-	}
+func TestLexerGoldFiles(t *testing.T) {
 	for _, sourceFileName := range testFiles {
-		t.Run(sourceFileName, func(t *testing.T) {
+		t.Run(path.Base(sourceFileName), func(t *testing.T) {
 			// Source
 			sourceBuf, err := ioutil.ReadFile(sourceFileName)
 			if err != nil {
 				log.Fatalln(err)
 			}
 			source := string(sourceBuf)
-			sourceTokens := scanSource(source)
+			sourceTokens := scanSourceTokens(source)
 			// Golden
-			var b bytes.Buffer
-			err = temp.Execute(&b, sourceFileName)
-			if err != nil {
-				log.Fatalln(err)
-			}
-			goldFileName := b.String()
+			goldFileName := sourceFileName + ".lexer.gold.json"
 			goldBuf, err := ioutil.ReadFile(goldFileName)
 			if err != nil {
 				log.Fatalln(err)
@@ -57,15 +47,29 @@
 			if err != nil {
 				log.Fatalln(err)
 			}
-			// Compare
-			if diff := cmp.Diff(goldTokens, sourceTokens); diff != "" {
-				t.Errorf("lex mismatch (-want +got):\n%s", diff)
-			}
+			t.Run("Terminals", func(t *testing.T) {
+				var goldTokenIds []Token
+				for _, tok := range goldTokens {
+					goldTokenIds = append(goldTokenIds, tok.Terminal)
+				}
+				var sourceTokenIds []Token
+				for _, tok := range sourceTokens {
+					sourceTokenIds = append(sourceTokenIds, tok.Terminal)
+				}
+				if diff := cmp.Diff(goldTokenIds, sourceTokenIds); diff != "" {
+					t.Errorf("lex mismatch (-want +got):\n%s", diff)
+				}
+			})
+			t.Run("Values", func(t *testing.T) {
+				if diff := cmp.Diff(goldTokens, sourceTokens); diff != "" {
+					t.Errorf("lex mismatch (-want +got):\n%s", diff)
+				}
+			})
 		})
 	}
 }
 
-func scanSource(source string) []SourceToken {
+func scanSourceTokens(source string) []SourceToken {
 	var lex Lexer
 	lex.Init(source)
 	var tokens []SourceToken
@@ -75,9 +79,9 @@
 	return tokens
 }
 
-func scan(source string) []Token {
+func scanTerminals(source string) []Token {
 	var tokens []Token
-	for _, t := range scanSource(source) {
+	for _, t := range scanSourceTokens(source) {
 		tokens = append(tokens, t.Terminal)
 	}
 	return tokens
@@ -100,6 +104,8 @@
 	{"Identifier", "object Type oBJECT", []Token{OBJECTID, TYPEID, OBJECTID}},
 	{"IntegerLiteral", "0 000 0000 01234567890", []Token{INTEGERLITERAL, INTEGERLITERAL, INTEGERLITERAL, INTEGERLITERAL}},
 	{"StringLiteral", "\"\" \" \" \" foo \"", []Token{STRINGLITERAL, STRINGLITERAL, STRINGLITERAL}},
+	{"StringLiteralEscapes", "\" \\a\\b\\\"\\c\\\"\\d\\\\\\\n \"", []Token{STRINGLITERAL}},
+	{"EmptyStringLiteral", "\"\"", []Token{STRINGLITERAL}},
 	{"Whitespace", " \t\t \f \v \r\r\r\n\n ", nil},
 	{"BoolLiteral", "true false tRUE fALSE True False", []Token{BOOLLITERAL, BOOLLITERAL, BOOLLITERAL, BOOLLITERAL, TYPEID, TYPEID}},
 	{"KeywordClass", "class CLASS Class cLASS", []Token{CLASS, CLASS, CLASS, CLASS}},
@@ -164,6 +170,16 @@
 	{"TokenAndInvalidRSub", "a ] a", []Token{OBJECTID, INVALID_TOKEN, OBJECTID}},
 	{"TokenAndInvalidBackslash", "a \\ a", []Token{OBJECTID, INVALID_TOKEN, OBJECTID}},
 	{"TokenAndInvalidPipe", "a | a", []Token{OBJECTID, INVALID_TOKEN, OBJECTID}},
+	{"InvalidNull", "\x00", []Token{INVALID_TOKEN}},
+	{"TokenAndInvalidNull", "a \x00 a\x00a", []Token{OBJECTID, INVALID_TOKEN, OBJECTID, INVALID_TOKEN, OBJECTID}},
+	{"OneNullInStringLiteral", "\"this is a string \x00 literal\"", []Token{INVALID_TOKEN}},
+	{"TwoNullInStringLiteral", "\"this is \x00 a string \x00 literal\"", []Token{INVALID_TOKEN}},
+	{"OneEscapedNullInStringLiteral", "\"this is an ill formed string \\\x00 literal\"", []Token{INVALID_TOKEN}},
+	{"TwoEscapedNullInStringLiteral", "\"this is \\\x00 an ill formed string \\\x00 literal\"", []Token{INVALID_TOKEN}},
+	// An unescaped '\n' splits a string literal into lexable chunks.
+	// It's unclear whether this is the right behaviour; the language
+	// spec gives no hints. See the 'StringLiteral' rule for details.
+ {"StringLiteralUnsecapedNewline", "\"a \n b\"", []Token{INVALID_TOKEN, OBJECTID, INVALID_TOKEN}}, } var testFiles = []string{ @@ -227,9 +243,7 @@ var testFiles = []string{ "testdata/objectid.test.cool", "testdata/palindrome.cool", "testdata/sort_list.cl.cool", + "testdata/escapednull.cool", + "testdata/null_in_string.cl.cool", + "testdata/null_in_string_followed_by_tokens.cl.cool", } - -// TODO -// FAIL {"testdata/escapednull.cool", []Token{INVALID_TOKEN}}, -// FAIL {"testdata/null_in_string.cl.cool", []Token{INVALID_TOKEN}}, -// FAIL {"testdata/null_in_string_followed_by_tokens.cl.cool", []Token{INVALID_TOKEN, OBJECTID, PLUS}}, diff --git a/testdata/escapednull.cool.lexer.gold.json b/testdata/escapednull.cool.lexer.gold.json new file mode 100644 index 0000000..0bab3fa --- /dev/null +++ b/testdata/escapednull.cool.lexer.gold.json @@ -0,0 +1,7 @@ +[ + { + "line": 1, + "token": 1, + "source": "\"This contains an escaped null character \\\u0000\"" + } +] diff --git a/testdata/null_in_string.cl.cool.lexer.gold.json b/testdata/null_in_string.cl.cool.lexer.gold.json new file mode 100644 index 0000000..6283e4e --- /dev/null +++ b/testdata/null_in_string.cl.cool.lexer.gold.json @@ -0,0 +1,7 @@ +[ + { + "line": 2, + "token": 1, + "source": "\"null character is here -\u003e\u0000\u003c-\"" + } +] diff --git a/testdata/null_in_string_followed_by_tokens.cl.cool.lexer.gold.json b/testdata/null_in_string_followed_by_tokens.cl.cool.lexer.gold.json new file mode 100644 index 0000000..4705460 --- /dev/null +++ b/testdata/null_in_string_followed_by_tokens.cl.cool.lexer.gold.json @@ -0,0 +1,17 @@ +[ + { + "line": 2, + "token": 1, + "source": "\"null character is here -\u003e\u0000\u003c-\"" + }, + { + "line": 2, + "token": 7, + "source": "a" + }, + { + "line": 2, + "token": 40, + "source": "+" + } +]