Skip to content

Commit

Permalink
Started adding normal regex
Browse files Browse the repository at this point in the history
  • Loading branch information
jmeaster30 committed Apr 16, 2023
1 parent fb38ff2 commit 15b2026
Show file tree
Hide file tree
Showing 9 changed files with 351 additions and 6 deletions.
6 changes: 4 additions & 2 deletions libvore-syntax-highlighter/language-configuration.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
["{", "}"],
["(", ")"],
["\"", "\""],
["'", "'"]
["'", "'"],
["@/", "/"]
],
"surroundingPairs": [
["{", "}"],
["(", ")"],
["\"", "\""],
["'", "'"]
["'", "'"],
["@/", "/"]
]
}
8 changes: 8 additions & 0 deletions libvore-syntax-highlighter/syntaxes/vore.tmLanguage.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
},
{
"include": "#sinstrings"
},
{
"include": "#regexp"
}
],
"repository": {
Expand Down Expand Up @@ -99,6 +102,11 @@
"match": "\\\\."
}
]
},
"regexp": {
"name": "constant.regexp",
"begin": "@/",
"end": "/"
}
},
"scopeName": "source.vore"
Expand Down
23 changes: 22 additions & 1 deletion libvore/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const (
IDENTIFIER
NUMBER
STRING
REGEXP

// misc
EQUAL
Expand Down Expand Up @@ -257,6 +258,8 @@ func (t TokenType) PP() string {
return "FALSE"
case CASELESS:
return "CASELESS"
case REGEXP:
return "REGEXP"
default:
panic("UNKNOWN TOKEN TYPE")
}
Expand Down Expand Up @@ -350,6 +353,7 @@ func (s *Lexer) getNextToken() (*Token, error) {
SDASH
SOPERATOR
SOPERATORSTART
SREGEXP
SERROR
SEND
)
Expand Down Expand Up @@ -527,8 +531,23 @@ func (s *Lexer) getNextToken() (*Token, error) {
} else if current_state == SCOMMENTSTART {
s.unread_last()
current_state = SCOMMENT
} else if current_state == SSTART && ch == '@' {
next_ch := s.read()
if next_ch != '/' {
s.unread_last()
current_state = SERROR
break
}
curr_ch := s.read()
for curr_ch != '/' {
buf.WriteRune(curr_ch)
curr_ch = s.read()
}

current_state = SREGEXP
break
} else {
if current_state != SSTART || unicode.IsDigit(ch) || unicode.IsLetter(ch) || unicode.IsSpace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == ',' || ch == ':' || ch == '=' || ch == '"' || ch == '\'' || ch == '-' || ch == '+' || ch == '<' || ch == '>' || ch == '*' || ch == '/' || ch == '%' {
if current_state != SSTART || unicode.IsDigit(ch) || unicode.IsLetter(ch) || unicode.IsSpace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == ',' || ch == ':' || ch == '=' || ch == '"' || ch == '\'' || ch == '-' || ch == '+' || ch == '<' || ch == '>' || ch == '*' || ch == '/' || ch == '%' || ch == '@' {
s.unread_last()
} else {
buf.WriteRune(ch)
Expand All @@ -555,6 +574,8 @@ func (s *Lexer) getNextToken() (*Token, error) {
token.TokenType = STRING
case SNUMBER:
token.TokenType = NUMBER
case SREGEXP:
token.TokenType = REGEXP
case SIDENTIFIER:
token.TokenType = IDENTIFIER
lexeme := strings.ToLower(buf.String())
Expand Down
5 changes: 3 additions & 2 deletions libvore/lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ func TestCheckUnendingBlockCommentError2(t *testing.T) {
}

func TestCheckUnknownToken(t *testing.T) {
lexer := initLexer(strings.NewReader("ident @"))
lexer := initLexer(strings.NewReader("ident $"))
tokens, err := lexer.getTokens()

checkVoreErrorToken(t, err, "LexError", ERROR, "@", 6, 7, "Unknown token :(")
checkVoreErrorToken(t, err, "LexError", ERROR, "$", 6, 7, "Unknown token :(")

if len(tokens) != 0 {
t.Errorf("Expected no tokens returned on error. Got %d tokens", len(tokens))
Expand Down Expand Up @@ -200,4 +200,5 @@ func TestTokenTypePP(t *testing.T) {
ppMatch(t, FALSE, "FALSE")
ppMatch(t, WHOLE, "WHOLE")
ppMatch(t, CASELESS, "CASELESS")
ppMatch(t, REGEXP, "REGEXP")
}
2 changes: 2 additions & 0 deletions libvore/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ func parse_expression(tokens []*Token, token_index int) (AstExpression, int, err
return parse_subroutine(tokens, token_index)
} else if current_token.TokenType == NOT {
return parse_not_expression(tokens, token_index)
} else if current_token.TokenType == REGEXP {
return parse_regexp(tokens, token_index)
} else if current_token.TokenType == STRING || current_token.TokenType == IDENTIFIER ||
current_token.TokenType == OPENPAREN || current_token.TokenType == ANY ||
current_token.TokenType == WHITESPACE || current_token.TokenType == DIGIT ||
Expand Down
227 changes: 227 additions & 0 deletions libvore/parser_regexp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
package libvore

import (
	"strconv"
	"unicode/utf8"
)

// The regexp grammar below is adapted from the ECMAScript specification:
// https://262.ecma-international.org/13.0/#sec-patterns
// ECMAScript was chosen mainly because its grammar was the easiest to find.

/*
PATTERN :: DISJUNCTION
DISJUNCTION :: ALTERNATIVE
:: ALTERNATIVE | DISJUNCTION
ALTERNATIVE :: [empty]
:: ALTERNATIVE TERM
TERM :: ASSERTION
:: ATOM
:: ATOM QUANTIFIER
ASSERTION :: ^
:: $
:: \ b
:: \ B
:: ( ? = DISJUNCTION )
:: ( ? ! DISJUNCTION )
:: ( ? < = DISJUNCTION )
:: ( ? < ! DISJUNCTION )
QUANTIFIER :: QUANTIFIERPREFIX
:: QUANTIFIERPREFIX ?
QUANTIFIERPREFIX :: *
:: +
:: ?
:: { DECIMALDIGITS }
:: { DECIMALDIGITS , }
:: { DECIMALDIGITS , DECIMALDIGITS }
ATOM :: PATTERNCHARACTER
:: .
:: \ ATOMESCAPE
:: CHARACTERCLASS
:: ( GROUPSPECIFIER DISJUNCTION )
:: ( ? : DISJUNCTION )
SYNTAXCHARACTER :: one of ^ $ \ . * + ? ( ) [ ] { } |
PATTERNCHARACTER :: any single character except for SYNTAXCHARACTER
ATOMESCAPE :: unimplemented
CHARACTERCLASS :: unimplemented
GROUPSPECIFIER :: [empty]
:: ? GROUPNAME
GROUPNAME :: < IDENTIFIER > -- will break ECMAScript standard here and just do the same style identifiers in vore
*/

// parse_regexp converts the REGEXP token at tokens[token_index] into an
// expression tree. The token's lexeme (the text between "@/" and "/") is
// parsed term by term with parse_regexp_pattern and the resulting terms are
// wrapped, in order, in a single sub-expression.
//
// It returns the expression and the index of the token following the regexp.
// On failure it returns a nil expression, the index of the regexp token
// itself and the parse error.
func parse_regexp(tokens []*Token, token_index int) (AstExpression, int, error) {
	regexp_token := tokens[token_index]
	regexp := regexp_token.Lexeme

	index := 0
	results := []AstExpression{}
	for index < len(regexp) {
		exp, next_index, err := parse_regexp_pattern(regexp_token, regexp, index)
		if err != nil {
			// next_index is a byte offset inside the lexeme, not a position
			// in tokens, so it must not leak out as the token index.
			return nil, token_index, err
		}
		results = append(results, exp)
		index = next_index
	}

	s := &AstPrimary{
		&AstSubExpr{results},
	}

	return s, token_index + 1, nil
}

// parse_regexp_pattern parses one term of the regexp starting at byte offset
// index: a single literal (see parse_regexp_literal) optionally followed by
//
//   - a quantifier: *, +, ?, {n}, {n,} or {n,m}, each of which may carry a
//     trailing ? that marks the loop as lazy ("fewest"), or
//   - a | followed by another term, producing a branch.
//
// It returns the parsed expression and the offset of the first byte that was
// not consumed. Malformed input (an unterminated or ill-formed {...}
// quantifier, or a | with nothing after it) yields a parse error instead of
// an out-of-range panic.
//
// NOTE(review): a branch is built from the single literal on the left of |
// and the single term on its right, so "ab|cd" does not alternate between
// "ab" and "cd" as the ECMAScript grammar would; a | that follows a
// quantifier is not seen here at all and is parsed as a plain character by
// the next call.
func parse_regexp_pattern(regexp_token *Token, regexp string, index int) (AstExpression, int, error) {
	// Reached through the recursive call below when the pattern ends in '|'.
	if index >= len(regexp) {
		return nil, index, NewParseError(*regexp_token, "Unexpected end of regexp")
	}

	start, next_index, err := parse_regexp_literal(regexp_token, regexp, index)
	if err != nil {
		return nil, next_index, err
	}

	if next_index >= len(regexp) {
		return &AstPrimary{start}, next_index, nil
	}

	// make_loop wraps start in a loop running from..to times (-1 meaning
	// unbounded). last is the offset of the final byte of the quantifier
	// itself; a '?' directly after it makes the loop lazy and is consumed.
	make_loop := func(from int, to int, last int) (AstExpression, int, error) {
		resume := last + 1
		fewest := resume < len(regexp) && regexp[resume] == '?'
		if fewest {
			resume++
		}
		return &AstLoop{from, to, fewest, &AstPrimary{start}, ""}, resume, nil
	}

	switch regexp[next_index] {
	case '*':
		return make_loop(0, -1, next_index)
	case '+':
		return make_loop(1, -1, next_index)
	case '?':
		return make_loop(0, 1, next_index)
	case '{':
		// Require a closing brace up front. Once one is known to exist,
		// every index used below is at or before it and therefore in range.
		closing := -1
		for i := next_index + 1; i < len(regexp); i++ {
			if regexp[i] == '}' {
				closing = i
				break
			}
		}
		if closing < 0 {
			return nil, next_index, NewParseError(*regexp_token, "Unexpected end of regexp. Expected '}'")
		}

		from, idx, err := parse_regexp_number(regexp_token, regexp, next_index+1)
		if err != nil {
			return nil, idx, err
		}

		switch regexp[idx] {
		case '}':
			// {n}
			return make_loop(from, from, idx)
		case ',':
			if regexp[idx+1] == '}' {
				// {n,}
				return make_loop(from, -1, idx+1)
			}
			// {n,m}
			to, idx2, err := parse_regexp_number(regexp_token, regexp, idx+1)
			if err != nil {
				return nil, idx2, err
			}
			if regexp[idx2] != '}' {
				return nil, idx2, NewParseError(*regexp_token, "Unexpected character. Expected '}'")
			}
			return make_loop(from, to, idx2)
		default:
			return nil, idx, NewParseError(*regexp_token, "Unexpected character. Expected ',' or '}'")
		}
	case '|':
		end, idx, err := parse_regexp_pattern(regexp_token, regexp, next_index+1)
		if err != nil {
			return nil, idx, err
		}
		return &AstBranch{start, end}, idx, nil
	}

	return &AstPrimary{start}, next_index, nil
}

// parse_regexp_number reads a run of ASCII decimal digits from regexp
// starting at byte offset index.
//
// It returns the parsed value and the offset of the first byte after the
// digits. If there is no digit at index (including when index is at or past
// the end of regexp), or the digits do not fit in an int, it returns -1 and a
// parse error attached to regexp_token.
func parse_regexp_number(regexp_token *Token, regexp string, index int) (int, int, error) {
	// Scan with an explicit bound so digits that run up to the end of the
	// pattern (e.g. "a{12") stop the loop instead of indexing out of range.
	idx := index
	for idx < len(regexp) && regexp[idx] >= '0' && regexp[idx] <= '9' {
		idx++
	}
	if idx == index {
		return -1, index, NewParseError(*regexp_token, "Unexpected Token. Expected number")
	}

	value, err := strconv.Atoi(regexp[index:idx])
	if err != nil {
		// Only digits were collected, so this is an out-of-range value.
		return -1, idx, NewParseError(*regexp_token, "Error converting string to number")
	}
	return value, idx, nil
}

// parse_regexp_literal parses the single assertion or atom that starts at
// byte offset index of regexp:
//
//	^   line-start class
//	$   line-end class
//	\   escape, delegated to parse_regexp_escape_characters
//	(   group, delegated to parse_regexp_groups
//	.   any character except "\n" (a negated "\n" string)
//	any other character is matched literally
//
// It returns the literal and the offset of the first byte after it. When a
// delegated parser fails, the literal is nil and the returned offset is index.
// The caller must ensure index < len(regexp).
func parse_regexp_literal(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) {
	switch regexp[index] {
	case '^':
		return &AstCharacterClass{false, ClassLineStart}, index + 1, nil
	case '$':
		return &AstCharacterClass{false, ClassLineEnd}, index + 1, nil
	case '\\':
		exp, idx, err := parse_regexp_escape_characters(regexp_token, regexp, index+1)
		if err != nil {
			return nil, index, err
		}
		return exp, idx, nil
	case '(':
		exp, idx, err := parse_regexp_groups(regexp_token, regexp, index+1)
		if err != nil {
			return nil, index, err
		}
		return exp, idx, nil
	case '.':
		return &AstString{true, "\n", false}, index + 1, nil
	default:
		// Take the whole UTF-8 encoded character, not one byte: converting a
		// lone byte >= 0x80 with string(c) would re-encode it as a different
		// code point and split multi-byte characters into garbage literals.
		// Slicing (rather than string(r)) also keeps invalid bytes verbatim.
		_, size := utf8.DecodeRuneInString(regexp[index:])
		return &AstString{false, regexp[index : index+size], false}, index + size, nil
	}
}

// parse_regexp_escape_characters is the hook for parsing what follows a
// backslash in a regexp (ATOMESCAPE in the grammar above). It is a
// placeholder: no input is inspected or consumed, and it always yields a nil
// literal, the index it was given, and no error.
func parse_regexp_escape_characters(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) {
	var unimplemented AstLiteral
	return unimplemented, index, nil
}

// parse_regexp_groups is the hook for parsing a parenthesised group in a
// regexp. It is a placeholder: no input is inspected or consumed, and it
// always yields a nil literal, the index it was given, and no error.
func parse_regexp_groups(regexp_token *Token, regexp string, index int) (AstLiteral, int, error) {
	var unimplemented AstLiteral
	return unimplemented, index, nil
}
2 changes: 1 addition & 1 deletion libvore/vore.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ func printString(str ValueString) {
func (m Match) Print() {
fmt.Printf("Filename: %s\n", m.Filename)
fmt.Printf("MatchNumber: %d\n", m.MatchNumber)
fmt.Printf("Value: %s\n", m.Value)
fmt.Printf("Value: '%s'\n", m.Value)
if m.Replacement.HasValue() {
fmt.Printf("Replaced: %s\n", m.Replacement.GetValue())
}
Expand Down
Loading

0 comments on commit 15b2026

Please sign in to comment.