Started parsing character sets

jmeaster30 · Apr 19, 2023 · c6acc4f · c6acc4f
1 parent 70658ae
commit c6acc4f
Show file tree

Hide file tree

Showing 2 changed files with 141 additions and 55 deletions.
diff --git a/libvore/parser_regexp.go b/libvore/parser_regexp.go
@@ -4,63 +4,11 @@ import (
 	"strconv"
 )
 
-// This is mostly built off of the ECMAScript spec for regular expressions
-// https://262.ecma-international.org/13.0/#sec-patterns
-// this was mostly because it was the easiest to find the grammar
-
 /*
+   This is mostly built off of the ECMAScript spec for regular expressions
+   https://262.ecma-international.org/13.0/#sec-patterns
 
-PATTERN :: DISJUNCTION
-
-DISJUNCTION :: ALTERNATIVE
-			:: ALTERNATIVE | DISJUNCTION
-
-ALTERNATIVE :: [empty]
-			:: ALTERNATIVE TERM
-
-TERM :: ASSERTION
-	 :: ATOM
-	 :: ATOM QUANTIFIER
-
-ASSERTION :: ^
-		  :: $
-		  :: \ b
-		  :: \ B
-		  :: ( ? = DISJUNCTION )
-		  :: ( ? ! DISJUNCTION )
-		  :: ( ? < = DISJUNCTION )
-		  :: ( ? < ! DISJUNCTION )
-
-QUANTIFIER :: QUANTIFIERPREFIX
-		   :: QUANTIFIERPREFIX ?
-
-QUANTIFIERPREFIX :: *
-			     :: +
-				 :: ?
-				 :: { DECIMALDIGITS }
-				 :: { DECIMALDIGITS , }
-				 :: { DECIMALDIGITS , DECIMALDIGITS }
-
-ATOM :: PATTERNCHARACTER
-	 :: .
-	 :: \ ATOMESCAPE
-	 :: CHARACTERCLASS
-	 :: ( GROUPSPECIFIER DISJUNCTION )
-	 :: ( ? : DISJUNCTION )
-
-SYNTAXCHARACTER :: one of ^ $ \ . * + ? ( ) [ ] { }
-
-PATTERNCHARACTER :: any single character except for SYNTAXCHARACTER
-
-ATOMESCAPE :: unimplemented
-
-CHARACTERCLASS :: unimplemented
-
-GROUPSPECIFIER :: [empty]
-			   :: ? GROUPNAME
-
-GROUPNAME :: < IDENTIFIER >     -- will break ECMAScript standard here and just do the same style identifiers in vore
-
+   I will be making some changes to the grammar but I do want it to be as close to the specification as possible
 */
 
 func parse_regexp(tokens []*Token, token_index int) (AstExpression, int, error) {
@@ -155,6 +103,20 @@ func parse_regexp_literal(regexp_token *Token, regexp string, index int) (AstExp
 		}
 		exp.body = &AstPrimary{start}
 		return exp, idx, nil
+	} else if c == '[' {
+		start, next_index, err := parse_regexp_character_class(regexp_token, regexp, index+1)
+		if err != nil {
+			return nil, next_index, err
+		}
+		exp, idx, err := parse_regexp_quantifier(regexp_token, regexp, next_index)
+		if err != nil {
+			return nil, idx, err
+		}
+		if exp == nil {
+			return start, idx, nil
+		}
+		exp.body = start
+		return exp, idx, nil
 	} else if c == '.' {
 		start = &AstString{true, "\n", false}
 		next_index += 1
@@ -182,6 +144,89 @@ func parse_regexp_literal(regexp_token *Token, regexp string, index int) (AstExp
 	}
 }
 
+func parse_regexp_character_class(regexp_token *Token, regexp string, index int) (AstExpression, int, error) {
+	if index >= len(regexp) {
+		return nil, index, NewParseError(*regexp_token, "Unexpected end of regexp")
+	}
+
+	next_index := index
+	notin := false
+	if regexp[next_index] == '^' {
+		notin = true
+		next_index += 1
+	}
+
+	if next_index >= len(regexp) {
+		return nil, index, NewParseError(*regexp_token, "Unexpected end of regexp")
+	}
+
+	results := []AstListable{}
+	for next_index < len(regexp) && regexp[next_index] != ']' {
+		listable, idx, err := parse_regexp_class_ranges(regexp_token, regexp, next_index)
+		if err != nil {
+			return nil, idx, err
+		}
+		results = append(results, listable)
+		next_index = idx
+	}
+
+	if next_index >= len(regexp) {
+		return nil, next_index, NewParseError(*regexp_token, "Unexpected end of regexp")
+	}
+
+	if len(results) == 0 {
+		results = append(results, &AstCharacterClass{true, ClassAny})
+	}
+
+	next_index += 1
+
+	return &AstList{notin, results}, next_index, nil
+}
+
+func parse_regexp_class_ranges(regexp_token *Token, regexp string, index int) (AstListable, int, error) {
+	if regexp[index] == '\\' {
+		return parse_regexp_class_atom_escape(regexp_token, regexp, index)
+	} else {
+		start, next_index, err := parse_regexp_class_atom_string(regexp_token, regexp, index)
+		if err != nil {
+			return nil, next_index, err
+		}
+
+		if next_index >= len(regexp) {
+			return start, next_index, err
+		}
+
+		if regexp[next_index] == '-' {
+			to, end_index, err := parse_regexp_class_atom_string(regexp_token, regexp, next_index+1)
+			if err != nil {
+				return nil, end_index, err
+			}
+
+			if to == nil {
+				return start, next_index, nil
+			}
+
+			return &AstRange{start, to}, end_index, nil
+		}
+
+		return start, next_index, err
+	}
+}
+
+func parse_regexp_class_atom_escape(regexp_token *Token, regexp string, index int) (AstListable, int, error) {
+	if index+1 >= len(regexp) {
+		return nil, index + 1, NewParseError(*regexp_token, "Unexpected end of regexp")
+	}
+	panic("PARSE ESCAPE CHARACTER")
+}
+
+func parse_regexp_class_atom_string(regexp_token *Token, regexp string, index int) (*AstString, int, error) {
+	if regexp[index] == ']' {
+		return nil, index, nil
+	}
+	return &AstString{false, string(regexp[index]), false}, index + 1, nil
+}
+
 func parse_regexp_quantifier(regexp_token *Token, regexp string, index int) (*AstLoop, int, error) {
 	if index >= len(regexp) {
 		return nil, index, nil

diff --git a/libvore/vore_test.go b/libvore/vore_test.go
@@ -474,3 +474,44 @@ func TestRegexp8(t *testing.T) {
 		{6, "123", None[string](), []TestVar{}},
 	})
 }
+
+func TestRegexp9(t *testing.T) {
+	vore, err := Compile("find all @/[^]*/")
+	checkNoError(t, err)
+	results := vore.Run(`1231231
+	2312`)
+	matches(t, results, []TestMatch{
+		{0, `1231231
+	2312`, None[string](), []TestVar{}},
+	})
+}
+
+func TestRegexp10(t *testing.T) {
+	vore, err := Compile("find all @/[abc]*/")
+	checkNoError(t, err)
+	results := vore.Run(`123aabbcc986`)
+	matches(t, results, []TestMatch{
+		{3, `aabbcc`, None[string](), []TestVar{}},
+	})
+}
+
+func TestRegexp11(t *testing.T) {
+	vore, err := Compile("find all @/[a-z]{0,2}/")
+	checkNoError(t, err)
+	results := vore.Run("IT WILL CATCH this AND it WILL GET me")
+	matches(t, results, []TestMatch{
+		{14, "th", None[string](), []TestVar{}},
+		{16, "is", None[string](), []TestVar{}},
+		{23, "it", None[string](), []TestVar{}},
+		{35, "me", None[string](), []TestVar{}},
+	})
+}
+
+func TestRegexp12(t *testing.T) {
+	vore, err := Compile("find all @/[a-]*/")
+	checkNoError(t, err)
+	results := vore.Run(`123aa--a-ac986`)
+	matches(t, results, []TestMatch{
+		{3, `aa--a-a`, None[string](), []TestVar{}},
+	})
+}