From 15b2026f05a2237e081817635910d79a28265d35 Mon Sep 17 00:00:00 2001
From: jmeaster30 <jmeasterday@comcast.net>
Date: Sun, 16 Apr 2023 04:29:29 -0400
Subject: [PATCH] Started adding normal regex

---
 .../language-configuration.json               |   6 +-
 .../syntaxes/vore.tmLanguage.json             |   8 +
 libvore/lexer.go                              |  23 +-
 libvore/lexer_test.go                         |   5 +-
 libvore/parser.go                             |   2 +
 libvore/parser_regexp.go                      | 227 ++++++++++++++++++
 libvore/vore.go                               |   2 +-
 libvore/vore_test.go                          |  82 +++++++
 main.go                                       |   2 +
 9 files changed, 351 insertions(+), 6 deletions(-)
 create mode 100644 libvore/parser_regexp.go

diff --git a/libvore-syntax-highlighter/language-configuration.json b/libvore-syntax-highlighter/language-configuration.json
index 37f4f91..238012b 100644
--- a/libvore-syntax-highlighter/language-configuration.json
+++ b/libvore-syntax-highlighter/language-configuration.json
@@ -11,12 +11,14 @@
         ["{", "}"],
         ["(", ")"],
         ["\"", "\""],
-        ["'", "'"]
+        ["'", "'"],
+        ["@/", "/"]
     ],
     "surroundingPairs": [
         ["{", "}"],
         ["(", ")"],
         ["\"", "\""],
-        ["'", "'"]
+        ["'", "'"],
+        ["@/", "/"]
     ]
 }
\ No newline at end of file
diff --git a/libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json b/libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json
index 16b054f..b58b02b 100644
--- a/libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json
+++ b/libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json
@@ -28,6 +28,9 @@
 		},
 		{
 			"include": "#sinstrings"
+		},
+		{ 
+			"include": "#regexp"
 		}
 	],
 	"repository": {
@@ -99,6 +102,11 @@
 					"match": "\\\\."
 				}
 			]
+		},
+		"regexp": {
+			"name": "constant.regexp",
+			"begin": "@/",
+			"end": "/"
 		}
 	},
 	"scopeName": "source.vore"
diff --git a/libvore/lexer.go b/libvore/lexer.go
index cb1e189..9386ab1 100644
--- a/libvore/lexer.go
+++ b/libvore/lexer.go
@@ -22,6 +22,7 @@ const (
 	IDENTIFIER
 	NUMBER
 	STRING
+	REGEXP
 
 	// misc
 	EQUAL
@@ -257,6 +258,8 @@ func (t TokenType) PP() string {
 		return "FALSE"
 	case CASELESS:
 		return "CASELESS"
+	case REGEXP:
+		return "REGEXP"
 	default:
 		panic("UNKNOWN TOKEN TYPE")
 	}
@@ -350,6 +353,7 @@ func (s *Lexer) getNextToken() (*Token, error) {
 		SDASH
 		SOPERATOR
 		SOPERATORSTART
+		SREGEXP
 		SERROR
 		SEND
 	)
@@ -527,8 +531,23 @@ func (s *Lexer) getNextToken() (*Token, error) {
 		} else if current_state == SCOMMENTSTART {
 			s.unread_last()
 			current_state = SCOMMENT
+		} else if current_state == SSTART && ch == '@' {
+			next_ch := s.read()
+			if next_ch != '/' {
+				s.unread_last()
+				current_state = SERROR
+				break
+			}
+			curr_ch := s.read()
+			for curr_ch != '/' {
+				buf.WriteRune(curr_ch)
+				curr_ch = s.read()
+			}
+
+			current_state = SREGEXP
+			break
 		} else {
-			if current_state != SSTART || unicode.IsDigit(ch) || unicode.IsLetter(ch) || unicode.IsSpace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == ',' || ch == ':' || ch == '=' || ch == '"' || ch == '\'' || ch == '-' || ch == '+' || ch == '<' || ch == '>' || ch == '*' || ch == '/' || ch == '%' {
+			if current_state != SSTART || unicode.IsDigit(ch) || unicode.IsLetter(ch) || unicode.IsSpace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == ',' || ch == ':' || ch == '=' || ch == '"' || ch == '\'' || ch == '-' || ch == '+' || ch == '<' || ch == '>' || ch == '*' || ch == '/' || ch == '%' || ch == '@' {
 				s.unread_last()
 			} else {
 				buf.WriteRune(ch)
@@ -555,6 +574,8 @@ func (s *Lexer) getNextToken() (*Token, error) {
 		token.TokenType = STRING
 	case SNUMBER:
 		token.TokenType = NUMBER
+	case SREGEXP:
+		token.TokenType = REGEXP
 	case SIDENTIFIER:
 		token.TokenType = IDENTIFIER
 		lexeme := strings.ToLower(buf.String())
diff --git a/libvore/lexer_test.go b/libvore/lexer_test.go
index f8c8d82..a887bbc 100644
--- a/libvore/lexer_test.go
+++ b/libvore/lexer_test.go
@@ -107,10 +107,10 @@ func TestCheckUnendingBlockCommentError2(t *testing.T) {
 }
 
 func TestCheckUnknownToken(t *testing.T) {
-	lexer := initLexer(strings.NewReader("ident @"))
+	lexer := initLexer(strings.NewReader("ident $"))
 	tokens, err := lexer.getTokens()
 
-	checkVoreErrorToken(t, err, "LexError", ERROR, "@", 6, 7, "Unknown token :(")
+	checkVoreErrorToken(t, err, "LexError", ERROR, "$", 6, 7, "Unknown token :(")
 
 	if len(tokens) != 0 {
 		t.Errorf("Expected no tokens returned on error. Got %d tokens", len(tokens))
@@ -200,4 +200,5 @@ func TestTokenTypePP(t *testing.T) {
 	ppMatch(t, FALSE, "FALSE")
 	ppMatch(t, WHOLE, "WHOLE")
 	ppMatch(t, CASELESS, "CASELESS")
+	ppMatch(t, REGEXP, "REGEXP")
 }
diff --git a/libvore/parser.go b/libvore/parser.go
index 1ba3e0d..100dbc4 100644
--- a/libvore/parser.go
+++ b/libvore/parser.go
@@ -304,6 +304,8 @@ func parse_expression(tokens []*Token, token_index int) (AstExpression, int, err
 		return parse_subroutine(tokens, token_index)
 	} else if current_token.TokenType == NOT {
 		return parse_not_expression(tokens, token_index)
+	} else if current_token.TokenType == REGEXP {
+		return parse_regexp(tokens, token_index)
 	} else if current_token.TokenType == STRING || current_token.TokenType == IDENTIFIER ||
 		current_token.TokenType == OPENPAREN || current_token.TokenType == ANY ||
 		current_token.TokenType == WHITESPACE || current_token.TokenType == DIGIT ||
diff --git a/libvore/parser_regexp.go b/libvore/parser_regexp.go
new file mode 100644
index 0000000..32d6fc8
--- /dev/null
+++ b/libvore/parser_regexp.go
@@ -0,0 +1,227 @@
+package libvore
+
+import (
+	"strconv"
+)
+
+// The regexp literal grammar below is based on the ECMAScript specification for
+// regular expression patterns: https://262.ecma-international.org/13.0/#sec-patterns
+// ECMAScript was chosen mainly because its grammar was the easiest to find.
+
+/*
+
+PATTERN :: DISJUNCTION
+
+DISJUNCTION :: ALTERNATIVE
+			:: ALTERNATIVE | DISJUNCTION
+
+ALTERNATIVE :: [empty]
+			:: ALTERNATIVE TERM
+
+TERM :: ASSERTION
+	 :: ATOM
+	 :: ATOM QUANTIFIER
+
+ASSERTION :: ^
+		  :: $
+		  :: \ b
+		  :: \ B
+		  :: ( ? = DISJUNCTION )
+		  :: ( ? ! DISJUNCTION )
+		  :: ( ? < = DISJUNCTION )
+		  :: ( ? < ! DISJUNCTION )
+
+QUANTIFIER :: QUANTIFIERPREFIX
+		   :: QUANTIFIERPREFIX ?
+
+QUANTIFIERPREFIX :: *
+			     :: +
+				 :: ?
+				 :: { DECIMALDIGITS }
+				 :: { DECIMALDIGITS , }
+				 :: { DECIMALDIGITS , DECIMALDIGITS }
+
+ATOM :: PATTERNCHARACTER
+	 :: .
+	 :: \ ATOMESCAPE
+	 :: CHARACTERCLASS
+	 :: ( GROUPSPECIFIER DISJUNCTION )
+	 :: ( ? : DISJUNCTION )
+
+SYNTAXCHARACTER :: one of ^ $ \ . * + ? ( ) [ ] { } |
+
+PATTERNCHARACTER :: any single character except for SYNTAXCHARACTER
+
+ATOMESCAPE :: unimplemented
+
+CHARACTERCLASS :: unimplemented
+
+GROUPSPECIFIER :: [empty]
+			   :: ? GROUPNAME
+
+GROUPNAME :: < IDENTIFIER >     -- will break ECMAScript standard here and just do the same style identifiers in vore
+
+*/
+
+// parse_regexp turns the lexeme of the token at token_index into an AST by
+// parsing patterns out of it one after another until the lexeme is consumed.
+// It returns the resulting expression and the index of the next token; on
+// failure it forwards the index and error reported by parse_regexp_pattern.
+func parse_regexp(tokens []*Token, token_index int) (AstExpression, int, error) {
+	source_token := tokens[token_index]
+	pattern := source_token.Lexeme
+	parts := make([]AstExpression, 0)
+
+	for offset := 0; offset < len(pattern); {
+		part, after, err := parse_regexp_pattern(source_token, pattern, offset)
+		if err != nil {
+			return nil, after, err
+		}
+		parts = append(parts, part)
+		offset = after
+	}
+
+	// The parsed pieces are grouped into a single sub-expression primary.
+	return &AstPrimary{&AstSubExpr{parts}}, token_index + 1, nil
+}
+
+// parse_regexp_pattern parses one term of regexp starting at byte offset
+// index: a literal (see parse_regexp_literal) optionally followed by a
+// quantifier (*, +, ?, {n}, {n,} or {n,m}, each of which a trailing '?' makes
+// lazy) or by '|' and the pattern to branch to.
+//
+// It returns the parsed expression and the offset of the first character that
+// was not consumed. On failure the expression is nil and the error is set.
+// Every read of regexp made directly by this function is bounds-checked.
+func parse_regexp_pattern(regexp_token *Token, regexp string, index int) (AstExpression, int, error) {
+	start, next_index, err := parse_regexp_literal(regexp_token, regexp, index)
+	if err != nil {
+		return nil, next_index, err
+	}
+	if next_index >= len(regexp) {
+		return &AstPrimary{start}, next_index, nil
+	}
+
+	// lo and hi are the loop bounds (hi == -1 means unbounded) and end_idx is
+	// the offset of the last character of the quantifier.
+	var lo, hi int
+	end_idx := next_index
+	switch regexp[next_index] {
+	case '*':
+		lo, hi = 0, -1
+	case '+':
+		lo, hi = 1, -1
+	case '?':
+		lo, hi = 0, 1
+	case '{':
+		if next_index+1 >= len(regexp) {
+			return nil, next_index, NewParseError(*regexp_token, "Unexpected end of regular expression. Expected number")
+		}
+		from, idx, err := parse_regexp_number(regexp_token, regexp, next_index+1)
+		if err != nil {
+			return nil, idx, err
+		}
+		lo, hi = from, from
+		if idx < len(regexp) && regexp[idx] == ',' {
+			// "{n,}" has no upper bound unless a second number follows.
+			hi = -1
+			idx += 1
+			if idx < len(regexp) && regexp[idx] != '}' {
+				hi, idx, err = parse_regexp_number(regexp_token, regexp, idx)
+				if err != nil {
+					return nil, idx, err
+				}
+			}
+		}
+		// Every form must end on '}'. The bounds check reports a pattern that
+		// ends here as a parse error instead of indexing past the string, and
+		// end_idx always lands on the '}' itself for all three forms.
+		if idx >= len(regexp) || regexp[idx] != '}' {
+			return nil, idx, NewParseError(*regexp_token, "Unexpected character. Expected '}'")
+		}
+		end_idx = idx
+	case '|':
+		if next_index+1 >= len(regexp) {
+			return nil, next_index, NewParseError(*regexp_token, "Unexpected end of regular expression. Expected expression after '|'")
+		}
+		end, idx, err := parse_regexp_pattern(regexp_token, regexp, next_index+1)
+		if err != nil {
+			return nil, idx, err
+		}
+		return &AstBranch{start, end}, idx, nil
+	default:
+		// No quantifier follows, so the literal stands alone.
+		return &AstPrimary{start}, next_index, nil
+	}
+
+	// A '?' directly after the quantifier sets the loop's fewest flag, i.e.
+	// the quantifier is lazy.
+	fewest := end_idx+1 < len(regexp) && regexp[end_idx+1] == '?'
+	exp := &AstLoop{lo, hi, fewest, &AstPrimary{start}, ""}
+	if fewest {
+		return exp, end_idx + 2, nil
+	}
+	return exp, end_idx + 1, nil
+}
+
+// parse_regexp_number reads a run of decimal digits from regexp starting at
+// byte offset index. It returns the value and the offset of the first character
+// after the digits, or -1, index and a parse error when no digit is found there
+// (including when index is at or past the end of regexp).
+func parse_regexp_number(regexp_token *Token, regexp string, index int) (int, int, error) {
+	// Bounds-check each read: the digits may run to the end (e.g. "a{4").
+	idx := index
+	for idx < len(regexp) && regexp[idx] >= '0' && regexp[idx] <= '9' {
+		idx += 1
+	}
+	if idx == index {
+		return -1, index, NewParseError(*regexp_token, "Unexpected Token. Expected number")
+	}
+	value, err := strconv.Atoi(regexp[index:idx])
+	if err != nil {
+		return value, idx, NewParseError(*regexp_token, "Error converting string to number")
+	}
+	return value, idx, nil
+}
+
+// parse_regexp_literal parses the single atom or assertion starting at byte
+// offset index of regexp and returns it with the offset just past it:
+//   - '^' and '$' become the ClassLineStart / ClassLineEnd character classes
+//   - '\' and '(' are handed to the escape and group parsers
+//   - '.' becomes AstString{true, "\n", false} (presumably "not a newline";
+//     confirm against AstString)
+//   - any other character becomes a literal AstString of that character
+//
+// An index at or past the end of regexp yields a parse error.
+func parse_regexp_literal(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) {
+	if index >= len(regexp) {
+		return nil, index, NewParseError(*regexp_token, "Unexpected end of regular expression")
+	}
+	switch regexp[index] {
+	case '^':
+		return &AstCharacterClass{false, ClassLineStart}, index + 1, nil
+	case '$':
+		return &AstCharacterClass{false, ClassLineEnd}, index + 1, nil
+	case '.':
+		return &AstString{true, "\n", false}, index + 1, nil
+	case '\\':
+		return parse_regexp_escape_characters(regexp_token, regexp, index+1)
+	case '(':
+		return parse_regexp_groups(regexp_token, regexp, index+1)
+	}
+	// The pattern is UTF-8: take the lead byte plus its continuation bytes
+	// (10xxxxxx) so a multi-byte character stays one literal, not one per byte.
+	end := index + 1
+	for end < len(regexp) && regexp[end]&0xC0 == 0x80 {
+		end += 1
+	}
+	return &AstString{false, regexp[index:end], false}, end, nil
+}
+
+func parse_regexp_escape_characters(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) { // placeholder for parsing the escape that follows a '\'; index is the offset just after the backslash
+	return nil, index, nil // TODO: unimplemented (ATOMESCAPE) - returns a nil literal, the index unchanged, and no error
+}
+
+func parse_regexp_groups(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) { // placeholder for parsing the group that follows a '('; index is the offset just after the parenthesis
+	return nil, index, nil // TODO: unimplemented - returns a nil literal, the index unchanged, and no error
+}
diff --git a/libvore/vore.go b/libvore/vore.go
index 7754d39..fa94396 100644
--- a/libvore/vore.go
+++ b/libvore/vore.go
@@ -293,7 +293,7 @@ func printString(str ValueString) {
 func (m Match) Print() {
 	fmt.Printf("Filename: %s\n", m.Filename)
 	fmt.Printf("MatchNumber: %d\n", m.MatchNumber)
-	fmt.Printf("Value: %s\n", m.Value)
+	fmt.Printf("Value: '%s'\n", m.Value)
 	if m.Replacement.HasValue() {
 		fmt.Printf("Replaced: %s\n", m.Replacement.GetValue())
 	}
diff --git a/libvore/vore_test.go b/libvore/vore_test.go
index 12e7fa3..3a51811 100644
--- a/libvore/vore_test.go
+++ b/libvore/vore_test.go
@@ -381,3 +381,85 @@ func TestCaseless(t *testing.T) {
 		{64, "tEsT", None[string](), []TestVar{}},
 	})
 }
+
+// TestRegexp checks the greedy `+` and `*` quantifiers inside a regexp literal.
+func TestRegexp(t *testing.T) {
+	compiled, err := Compile("find all @/a+b*/")
+	checkNoError(t, err)
+	matches(t, compiled.Run("aaabbb ab a"), []TestMatch{
+		{0, "aaabbb", None[string](), []TestVar{}},
+		{7, "ab", None[string](), []TestVar{}},
+		{10, "a", None[string](), []TestVar{}},
+	})
+}
+
+// TestRegexp2 checks a lazy `*?` quantifier followed by a required literal.
+func TestRegexp2(t *testing.T) {
+	compiled, err := Compile("find all @/a*?b/")
+	checkNoError(t, err)
+	matches(t, compiled.Run("aaabbb ab a"), []TestMatch{
+		{0, "aaab", None[string](), []TestVar{}},
+		{4, "b", None[string](), []TestVar{}},
+		{5, "b", None[string](), []TestVar{}},
+		{7, "ab", None[string](), []TestVar{}},
+	})
+}
+
+// TestRegexp3 checks a lazy `+?` quantifier followed by an optional `b?`.
+func TestRegexp3(t *testing.T) {
+	compiled, err := Compile("find all @/a+?b?/")
+	checkNoError(t, err)
+	matches(t, compiled.Run("aaabbb ab a"), []TestMatch{
+		{0, "a", None[string](), []TestVar{}},
+		{1, "a", None[string](), []TestVar{}},
+		{2, "ab", None[string](), []TestVar{}},
+		{7, "ab", None[string](), []TestVar{}},
+		{10, "a", None[string](), []TestVar{}},
+	})
+}
+
+// TestRegexp4 checks the bounded `{4,7}` quantifier across a two-line input.
+func TestRegexp4(t *testing.T) {
+	compiled, err := Compile("find all @/a{4,7}/")
+	checkNoError(t, err)
+	matches(t, compiled.Run(`aaaaaaaa
+	aaa aaaaaa`), []TestMatch{
+		{0, "aaaaaaa", None[string](), []TestVar{}},
+		{14, "aaaaaa", None[string](), []TestVar{}},
+	})
+}
+
+// TestRegexp5 checks the open-ended `{4,}` quantifier across a two-line input.
+func TestRegexp5(t *testing.T) {
+	compiled, err := Compile("find all @/a{4,}/")
+	checkNoError(t, err)
+	matches(t, compiled.Run(`aaaaaaaa
+	aaa aaaaaa`), []TestMatch{
+		{0, "aaaaaaaa", None[string](), []TestVar{}},
+		{14, "aaaaaa", None[string](), []TestVar{}},
+	})
+}
+
+// TestRegexp6 checks the exact-count `{4}` quantifier across a two-line input.
+func TestRegexp6(t *testing.T) {
+	compiled, err := Compile("find all @/a{4}/")
+	checkNoError(t, err)
+	matches(t, compiled.Run(`aaaaaaaa
+	aaa aaaaaa`), []TestMatch{
+		{0, "aaaa", None[string](), []TestVar{}},
+		{4, "aaaa", None[string](), []TestVar{}},
+		{14, "aaaa", None[string](), []TestVar{}},
+	})
+}
+
+// TestRegexp7 checks the lazy open-ended `{4,}?` quantifier on a two-line input.
+func TestRegexp7(t *testing.T) {
+	compiled, err := Compile("find all @/a{4,}?/")
+	checkNoError(t, err)
+	matches(t, compiled.Run(`aaaaaaaa
+	aaa aaaaaa`), []TestMatch{
+		{0, "aaaa", None[string](), []TestVar{}},
+		{4, "aaaa", None[string](), []TestVar{}},
+		{14, "aaaa", None[string](), []TestVar{}},
+	})
+}
diff --git a/main.go b/main.go
index a9c1a55..4e9e1fd 100644
--- a/main.go
+++ b/main.go
@@ -100,6 +100,8 @@ func main() {
 		os.Exit(1)
 	}
 
+	//vore.PrintAST()
+
 	results := vore.RunFiles([]string{*files_arg}, replaceModeArg, process_filenames)
 
 	if no_output { // skip all output