From 15b2026f05a2237e081817635910d79a28265d35 Mon Sep 17 00:00:00 2001 From: jmeaster30 Date: Sun, 16 Apr 2023 04:29:29 -0400 Subject: [PATCH] Started adding normal regex --- .../language-configuration.json | 6 +- .../syntaxes/vore.tmLanguage.json | 8 + libvore/lexer.go | 23 +- libvore/lexer_test.go | 5 +- libvore/parser.go | 2 + libvore/parser_regexp.go | 227 ++++++++++++++++++ libvore/vore.go | 2 +- libvore/vore_test.go | 82 +++++++ main.go | 2 + 9 files changed, 351 insertions(+), 6 deletions(-) create mode 100644 libvore/parser_regexp.go diff --git a/libvore-syntax-highlighter/language-configuration.json b/libvore-syntax-highlighter/language-configuration.json index 37f4f91..238012b 100644 --- a/libvore-syntax-highlighter/language-configuration.json +++ b/libvore-syntax-highlighter/language-configuration.json @@ -11,12 +11,14 @@ ["{", "}"], ["(", ")"], ["\"", "\""], - ["'", "'"] + ["'", "'"], + ["@/", "/"] ], "surroundingPairs": [ ["{", "}"], ["(", ")"], ["\"", "\""], - ["'", "'"] + ["'", "'"], + ["@/", "/"] ] } \ No newline at end of file diff --git a/libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json b/libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json index 16b054f..b58b02b 100644 --- a/libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json +++ b/libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json @@ -28,6 +28,9 @@ }, { "include": "#sinstrings" + }, + { + "include": "#regexp" } ], "repository": { @@ -99,6 +102,11 @@ "match": "\\\\." 
} ] + }, + "regexp": { + "name": "constant.regexp", + "begin": "@/", + "end": "/" } }, "scopeName": "source.vore" diff --git a/libvore/lexer.go b/libvore/lexer.go index cb1e189..9386ab1 100644 --- a/libvore/lexer.go +++ b/libvore/lexer.go @@ -22,6 +22,7 @@ const ( IDENTIFIER NUMBER STRING + REGEXP // misc EQUAL @@ -257,6 +258,8 @@ func (t TokenType) PP() string { return "FALSE" case CASELESS: return "CASELESS" + case REGEXP: + return "REGEXP" default: panic("UNKNOWN TOKEN TYPE") } @@ -350,6 +353,7 @@ func (s *Lexer) getNextToken() (*Token, error) { SDASH SOPERATOR SOPERATORSTART + SREGEXP SERROR SEND ) @@ -527,8 +531,23 @@ func (s *Lexer) getNextToken() (*Token, error) { } else if current_state == SCOMMENTSTART { s.unread_last() current_state = SCOMMENT + } else if current_state == SSTART && ch == '@' { + next_ch := s.read() + if next_ch != '/' { + s.unread_last() + current_state = SERROR + break + } + curr_ch := s.read() + for curr_ch != '/' { + buf.WriteRune(curr_ch) + curr_ch = s.read() + } + + current_state = SREGEXP + break } else { - if current_state != SSTART || unicode.IsDigit(ch) || unicode.IsLetter(ch) || unicode.IsSpace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == ',' || ch == ':' || ch == '=' || ch == '"' || ch == '\'' || ch == '-' || ch == '+' || ch == '<' || ch == '>' || ch == '*' || ch == '/' || ch == '%' { + if current_state != SSTART || unicode.IsDigit(ch) || unicode.IsLetter(ch) || unicode.IsSpace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == ',' || ch == ':' || ch == '=' || ch == '"' || ch == '\'' || ch == '-' || ch == '+' || ch == '<' || ch == '>' || ch == '*' || ch == '/' || ch == '%' || ch == '@' { s.unread_last() } else { buf.WriteRune(ch) @@ -555,6 +574,8 @@ func (s *Lexer) getNextToken() (*Token, error) { token.TokenType = STRING case SNUMBER: token.TokenType = NUMBER + case SREGEXP: + token.TokenType = REGEXP case SIDENTIFIER: token.TokenType = IDENTIFIER lexeme := strings.ToLower(buf.String()) diff 
--git a/libvore/lexer_test.go b/libvore/lexer_test.go index f8c8d82..a887bbc 100644 --- a/libvore/lexer_test.go +++ b/libvore/lexer_test.go @@ -107,10 +107,10 @@ func TestCheckUnendingBlockCommentError2(t *testing.T) { } func TestCheckUnknownToken(t *testing.T) { - lexer := initLexer(strings.NewReader("ident @")) + lexer := initLexer(strings.NewReader("ident $")) tokens, err := lexer.getTokens() - checkVoreErrorToken(t, err, "LexError", ERROR, "@", 6, 7, "Unknown token :(") + checkVoreErrorToken(t, err, "LexError", ERROR, "$", 6, 7, "Unknown token :(") if len(tokens) != 0 { t.Errorf("Expected no tokens returned on error. Got %d tokens", len(tokens)) @@ -200,4 +200,5 @@ func TestTokenTypePP(t *testing.T) { ppMatch(t, FALSE, "FALSE") ppMatch(t, WHOLE, "WHOLE") ppMatch(t, CASELESS, "CASELESS") + ppMatch(t, REGEXP, "REGEXP") } diff --git a/libvore/parser.go b/libvore/parser.go index 1ba3e0d..100dbc4 100644 --- a/libvore/parser.go +++ b/libvore/parser.go @@ -304,6 +304,8 @@ func parse_expression(tokens []*Token, token_index int) (AstExpression, int, err return parse_subroutine(tokens, token_index) } else if current_token.TokenType == NOT { return parse_not_expression(tokens, token_index) + } else if current_token.TokenType == REGEXP { + return parse_regexp(tokens, token_index) } else if current_token.TokenType == STRING || current_token.TokenType == IDENTIFIER || current_token.TokenType == OPENPAREN || current_token.TokenType == ANY || current_token.TokenType == WHITESPACE || current_token.TokenType == DIGIT || diff --git a/libvore/parser_regexp.go b/libvore/parser_regexp.go new file mode 100644 index 0000000..32d6fc8 --- /dev/null +++ b/libvore/parser_regexp.go @@ -0,0 +1,227 @@ +package libvore + +import ( + "strconv" +) + +// This is mostly built off of the ECMAScript spec for regular expressions +// https://262.ecma-international.org/13.0/#sec-patterns +// this was mostly because it was the easiest to find the grammar + +/* + +PATTERN :: DISJUNCTION + +DISJUNCTION 
:: ALTERNATIVE
+            :: ALTERNATIVE | DISJUNCTION
+
+ALTERNATIVE :: [empty]
+            :: ALTERNATIVE TERM
+
+TERM :: ASSERTION
+     :: ATOM
+     :: ATOM QUANTIFIER
+
+ASSERTION :: ^
+          :: $
+          :: \ b
+          :: \ B
+          :: ( ? = DISJUNCTION )
+          :: ( ? ! DISJUNCTION )
+          :: ( ? < = DISJUNCTION )
+          :: ( ? < ! DISJUNCTION )
+
+QUANTIFIER :: QUANTIFIERPREFIX
+           :: QUANTIFIERPREFIX ?
+
+QUANTIFIERPREFIX :: *
+                 :: +
+                 :: ?
+                 :: { DECIMALDIGITS }
+                 :: { DECIMALDIGITS , }
+                 :: { DECIMALDIGITS , DECIMALDIGITS }
+
+ATOM :: PATTERNCHARACTER
+     :: .
+     :: \ ATOMESCAPE
+     :: CHARACTERCLASS
+     :: ( GROUPSPECIFIER DISJUNCTION )
+     :: ( ? : DISJUNCTION )
+
+SYNTAXCHARACTER :: one of ^ $ \ . * + ? ( ) [ ] { }
+
+PATTERNCHARACTER :: any single character except for SYNTAXCHARACTER
+
+ATOMESCAPE :: unimplemented
+
+CHARACTERCLASS :: unimplemented
+
+GROUPSPECIFIER :: [empty]
+               :: ? GROUPNAME
+
+GROUPNAME :: < IDENTIFIER > -- will break ECMAScript standard here and just do the same style identifiers in vore
+
+*/
+
+// parse_regexp turns the REGEXP token at tokens[token_index] into an AST
+// expression. The token's lexeme (the text between "@/" and "/") is parsed
+// term by term with parse_regexp_pattern and the resulting terms are wrapped,
+// in order, in a single sub-expression. On success the returned index is the
+// index of the token following the REGEXP token.
+func parse_regexp(tokens []*Token, token_index int) (AstExpression, int, error) {
+	regexp_token := tokens[token_index]
+	regexp := regexp_token.Lexeme
+
+	index := 0
+	results := []AstExpression{}
+	for index < len(regexp) {
+		exp, next_index, err := parse_regexp_pattern(regexp_token, regexp, index)
+		if err != nil {
+			return nil, next_index, err
+		}
+		results = append(results, exp)
+		index = next_index
+	}
+
+	s := &AstPrimary{
+		&AstSubExpr{results},
+	}
+
+	return s, token_index + 1, nil
+}
+
+// parse_regexp_pattern parses one term of regexp starting at byte offset
+// index: a single literal followed by an optional quantifier (*, +, ?, {n},
+// {n,} or {n,m}, each optionally followed by ? to request the fewest
+// repetitions) or by a | alternation.
+//
+// It returns the parsed expression, the offset of the first byte that was not
+// consumed, and a parse error for malformed or truncated input. Truncated
+// input (for example "a{4" or a trailing "|") is reported as an error rather
+// than indexing past the end of regexp.
+func parse_regexp_pattern(regexp_token *Token, regexp string, index int) (AstExpression, int, error) {
+	if index >= len(regexp) {
+		return nil, index, NewParseError(*regexp_token, "Unexpected end of regexp")
+	}
+
+	start, next_index, err := parse_regexp_literal(regexp_token, regexp, index)
+	if err != nil {
+		return nil, next_index, err
+	}
+
+	if next_index >= len(regexp) {
+		return &AstPrimary{start}, next_index, nil
+	}
+
+	switch regexp[next_index] {
+	case '*':
+		return parse_regexp_lazy_suffix(&AstLoop{0, -1, false, &AstPrimary{start}, ""}, regexp, next_index)
+	case '+':
+		return parse_regexp_lazy_suffix(&AstLoop{1, -1, false, &AstPrimary{start}, ""}, regexp, next_index)
+	case '?':
+		return parse_regexp_lazy_suffix(&AstLoop{0, 1, false, &AstPrimary{start}, ""}, regexp, next_index)
+	case '{':
+		from, idx, err := parse_regexp_number(regexp_token, regexp, next_index+1)
+		if err != nil {
+			return nil, idx, err
+		}
+		if idx >= len(regexp) {
+			return nil, idx, NewParseError(*regexp_token, "Unexpected end of regexp. Expected ',' or '}'")
+		}
+
+		// close_idx always ends up as the offset of the closing '}' so that
+		// every quantifier form resumes at the same place afterwards.
+		to := from
+		close_idx := idx
+		switch regexp[idx] {
+		case '}':
+			// {n}: exactly n repetitions, to == from.
+		case ',':
+			if idx+1 >= len(regexp) {
+				return nil, idx + 1, NewParseError(*regexp_token, "Unexpected end of regexp. Expected number or '}'")
+			}
+			if regexp[idx+1] == '}' {
+				// {n,}: no upper bound.
+				to = -1
+				close_idx = idx + 1
+			} else {
+				// {n,m}
+				to, close_idx, err = parse_regexp_number(regexp_token, regexp, idx+1)
+				if err != nil {
+					return nil, close_idx, err
+				}
+				if close_idx >= len(regexp) || regexp[close_idx] != '}' {
+					return nil, close_idx, NewParseError(*regexp_token, "Unexpected character. Expected '}'")
+				}
+			}
+		default:
+			return nil, idx, NewParseError(*regexp_token, "Unexpected character. Expected ',' or '}'")
+		}
+		return parse_regexp_lazy_suffix(&AstLoop{from, to, false, &AstPrimary{start}, ""}, regexp, close_idx)
+	case '|':
+		end, idx, err := parse_regexp_pattern(regexp_token, regexp, next_index+1)
+		if err != nil {
+			return nil, idx, err
+		}
+		return &AstBranch{start, end}, idx, nil
+	}
+
+	return &AstPrimary{start}, next_index, nil
+}
+
+// parse_regexp_lazy_suffix finishes a quantifier whose last byte is at offset
+// quantifier_end. If the next byte is '?', the loop is marked as matching the
+// fewest repetitions and that byte is consumed as well. It returns the loop
+// and the offset of the first unconsumed byte.
+func parse_regexp_lazy_suffix(loop *AstLoop, regexp string, quantifier_end int) (AstExpression, int, error) {
+	next := quantifier_end + 1
+	if next < len(regexp) && regexp[next] == '?' {
+		loop.fewest = true
+		next++
+	}
+	return loop, next, nil
+}
+
+// parse_regexp_number reads a run of decimal digits from regexp starting at
+// byte offset index. It returns the parsed value and the offset of the first
+// byte after the digits. If there is no digit at index (including when index
+// is at or past the end of regexp) it returns -1, index and a parse error.
+func parse_regexp_number(regexp_token *Token, regexp string, index int) (int, int, error) {
+	idx := index
+	for idx < len(regexp) && regexp[idx] >= '0' && regexp[idx] <= '9' {
+		idx += 1
+	}
+	if idx == index {
+		return -1, index, NewParseError(*regexp_token, "Unexpected Token. Expected number")
+	}
+	value, err := strconv.Atoi(regexp[index:idx])
+	if err != nil {
+		return value, idx, NewParseError(*regexp_token, "Error converting string to number")
+	}
+	return value, idx, nil
+
+}
+
+func parse_regexp_literal(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) {
+	c := regexp[index]
+	var start AstLiteral
+	next_index := index
+	if c == '^' {
+		start = &AstCharacterClass{false, ClassLineStart}
+		next_index += 1
+	} else if c == '$' {
+		start = &AstCharacterClass{false, ClassLineEnd}
+		next_index += 1
+	} else if c == '\\' {
+		exp, idx, err := parse_regexp_escape_characters(regexp_token, regexp, index+1)
+		if err != nil {
+			return nil, index, err
+		}
+		start = exp
+		next_index = idx
+	} else if c == '(' {
+		exp, idx, err := parse_regexp_groups(regexp_token, regexp, index+1)
+		if err != nil {
+			return nil, index, err
+		}
+		start = exp
+		next_index = idx
+	} else if c == '.'
{ + start = &AstString{true, "\n", false} + next_index += 1 + } else { + start = &AstString{false, string(c), false} + next_index += 1 + } + return start, next_index, nil +} + +func parse_regexp_escape_characters(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) { + return nil, index, nil +} + +func parse_regexp_groups(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) { + return nil, index, nil +} diff --git a/libvore/vore.go b/libvore/vore.go index 7754d39..fa94396 100644 --- a/libvore/vore.go +++ b/libvore/vore.go @@ -293,7 +293,7 @@ func printString(str ValueString) { func (m Match) Print() { fmt.Printf("Filename: %s\n", m.Filename) fmt.Printf("MatchNumber: %d\n", m.MatchNumber) - fmt.Printf("Value: %s\n", m.Value) + fmt.Printf("Value: '%s'\n", m.Value) if m.Replacement.HasValue() { fmt.Printf("Replaced: %s\n", m.Replacement.GetValue()) } diff --git a/libvore/vore_test.go b/libvore/vore_test.go index 12e7fa3..3a51811 100644 --- a/libvore/vore_test.go +++ b/libvore/vore_test.go @@ -381,3 +381,85 @@ func TestCaseless(t *testing.T) { {64, "tEsT", None[string](), []TestVar{}}, }) } + +func TestRegexp(t *testing.T) { + vore, err := Compile("find all @/a+b*/") + checkNoError(t, err) + results := vore.Run("aaabbb ab a") + matches(t, results, []TestMatch{ + {0, "aaabbb", None[string](), []TestVar{}}, + {7, "ab", None[string](), []TestVar{}}, + {10, "a", None[string](), []TestVar{}}, + }) +} + +func TestRegexp2(t *testing.T) { + vore, err := Compile("find all @/a*?b/") + checkNoError(t, err) + results := vore.Run("aaabbb ab a") + matches(t, results, []TestMatch{ + {0, "aaab", None[string](), []TestVar{}}, + {4, "b", None[string](), []TestVar{}}, + {5, "b", None[string](), []TestVar{}}, + {7, "ab", None[string](), []TestVar{}}, + }) +} + +func TestRegexp3(t *testing.T) { + vore, err := Compile("find all @/a+?b?/") + checkNoError(t, err) + results := vore.Run("aaabbb ab a") + matches(t, results, []TestMatch{ + {0, "a", 
None[string](), []TestVar{}}, + {1, "a", None[string](), []TestVar{}}, + {2, "ab", None[string](), []TestVar{}}, + {7, "ab", None[string](), []TestVar{}}, + {10, "a", None[string](), []TestVar{}}, + }) +} + +func TestRegexp4(t *testing.T) { + vore, err := Compile("find all @/a{4,7}/") + checkNoError(t, err) + results := vore.Run(`aaaaaaaa + aaa aaaaaa`) + matches(t, results, []TestMatch{ + {0, "aaaaaaa", None[string](), []TestVar{}}, + {14, "aaaaaa", None[string](), []TestVar{}}, + }) +} + +func TestRegexp5(t *testing.T) { + vore, err := Compile("find all @/a{4,}/") + checkNoError(t, err) + results := vore.Run(`aaaaaaaa + aaa aaaaaa`) + matches(t, results, []TestMatch{ + {0, "aaaaaaaa", None[string](), []TestVar{}}, + {14, "aaaaaa", None[string](), []TestVar{}}, + }) +} + +func TestRegexp6(t *testing.T) { + vore, err := Compile("find all @/a{4}/") + checkNoError(t, err) + results := vore.Run(`aaaaaaaa + aaa aaaaaa`) + matches(t, results, []TestMatch{ + {0, "aaaa", None[string](), []TestVar{}}, + {4, "aaaa", None[string](), []TestVar{}}, + {14, "aaaa", None[string](), []TestVar{}}, + }) +} + +func TestRegexp7(t *testing.T) { + vore, err := Compile("find all @/a{4,}?/") + checkNoError(t, err) + results := vore.Run(`aaaaaaaa + aaa aaaaaa`) + matches(t, results, []TestMatch{ + {0, "aaaa", None[string](), []TestVar{}}, + {4, "aaaa", None[string](), []TestVar{}}, + {14, "aaaa", None[string](), []TestVar{}}, + }) +} diff --git a/main.go b/main.go index a9c1a55..4e9e1fd 100644 --- a/main.go +++ b/main.go @@ -100,6 +100,8 @@ func main() { os.Exit(1) } + //vore.PrintAST() + results := vore.RunFiles([]string{*files_arg}, replaceModeArg, process_filenames) if no_output { // skip all output