Skip to content

Commit

Permalink
Started parsing character sets
Browse files Browse the repository at this point in the history
  • Loading branch information
jmeaster30 committed Apr 19, 2023
1 parent 70658ae commit c6acc4f
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 55 deletions.
155 changes: 100 additions & 55 deletions libvore/parser_regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,63 +4,11 @@ import (
"strconv"
)

// This is mostly built off of the ECMAScript spec for regular expressions
// https://262.ecma-international.org/13.0/#sec-patterns
// It was chosen mostly because its grammar was the easiest one to find.

/*
This is mostly built off of the ECMAScript spec for regular expressions
https://262.ecma-international.org/13.0/#sec-patterns
PATTERN :: DISJUNCTION
DISJUNCTION :: ALTERNATIVE
:: ALTERNATIVE | DISJUNCTION
ALTERNATIVE :: [empty]
:: ALTERNATIVE TERM
TERM :: ASSERTION
:: ATOM
:: ATOM QUANTIFIER
ASSERTION :: ^
:: $
:: \ b
:: \ B
:: ( ? = DISJUNCTION )
:: ( ? ! DISJUNCTION )
:: ( ? < = DISJUNCTION )
:: ( ? < ! DISJUNCTION )
QUANTIFIER :: QUANTIFIERPREFIX
:: QUANTIFIERPREFIX ?
QUANTIFIERPREFIX :: *
:: +
:: ?
:: { DECIMALDIGITS }
:: { DECIMALDIGITS , }
:: { DECIMALDIGITS , DECIMALDIGITS }
ATOM :: PATTERNCHARACTER
:: .
:: \ ATOMESCAPE
:: CHARACTERCLASS
:: ( GROUPSPECIFIER DISJUNCTION )
:: ( ? : DISJUNCTION )
SYNTAXCHARACTER :: one of ^ $ \ . * + ? ( ) [ ] { }
PATTERNCHARACTER :: any single character except for SYNTAXCHARACTER
ATOMESCAPE :: unimplemented
CHARACTERCLASS :: unimplemented
GROUPSPECIFIER :: [empty]
:: ? GROUPNAME
GROUPNAME :: < IDENTIFIER > -- will break ECMAScript standard here and just do the same style identifiers in vore
I will be making some changes to the grammar but I do want it to be as close to the specification as possible
*/

func parse_regexp(tokens []*Token, token_index int) (AstExpression, int, error) {
Expand Down Expand Up @@ -155,6 +103,20 @@ func parse_regexp_literal(regexp_token *Token, regexp string, index int) (AstExp
}
exp.body = &AstPrimary{start}
return exp, idx, nil
} else if c == '[' {
start, next_index, err := parse_regexp_character_class(regexp_token, regexp, index+1)
if err != nil {
return nil, next_index, err
}
exp, idx, err := parse_regexp_quantifier(regexp_token, regexp, next_index)
if err != nil {
return nil, idx, err
}
if exp == nil {
return start, idx, nil
}
exp.body = start
return exp, idx, nil
} else if c == '.' {
start = &AstString{true, "\n", false}
next_index += 1
Expand Down Expand Up @@ -182,6 +144,89 @@ func parse_regexp_literal(regexp_token *Token, regexp string, index int) (AstExp
}
}

// parse_regexp_character_class parses the inside of a bracketed character
// class. index is the position just after the opening '['. A leading '^'
// marks the class as negated. Members are collected until the closing ']'.
//
// On success it returns an *AstList carrying the negation flag and the parsed
// members, together with the index just past the closing ']'. A class with no
// members gets a single ClassAny character class as its only member. If the
// pattern ends before the ']' is found, a parse error is returned.
func parse_regexp_character_class(regexp_token *Token, regexp string, index int) (AstExpression, int, error) {
	if index >= len(regexp) {
		return nil, index, NewParseError(*regexp_token, "Unexpected end of regexp")
	}

	pos := index
	negated := regexp[pos] == '^'
	if negated {
		pos++
	}

	// Nothing follows the optional '^'.
	if pos >= len(regexp) {
		return nil, index, NewParseError(*regexp_token, "Unexpected end of regexp")
	}

	var members []AstListable
	for {
		if pos >= len(regexp) {
			// Ran off the end of the pattern without seeing ']'.
			return nil, pos, NewParseError(*regexp_token, "Unexpected end of regexp")
		}
		if regexp[pos] == ']' {
			break
		}

		member, after, err := parse_regexp_class_ranges(regexp_token, regexp, pos)
		if err != nil {
			return nil, after, err
		}
		members = append(members, member)
		pos = after
	}

	if len(members) == 0 {
		members = append(members, &AstCharacterClass{true, ClassAny})
	}

	// pos is on the closing ']'; step past it.
	return &AstList{negated, members}, pos + 1, nil
}

// parse_regexp_class_ranges parses one member of a character class starting
// at index: a backslash escape, a single character, or a range "a-z".
//
// The caller must guarantee index < len(regexp). It returns the parsed member
// and the index of the first byte after it.
//
// A '-' is only treated as a range operator when an upper bound follows it.
// If the '-' is the last byte of the pattern, or is directly followed by ']',
// only the lower bound is returned and the returned index points at the '-',
// so the next call parses it as a literal character.
func parse_regexp_class_ranges(regexp_token *Token, regexp string, index int) (AstListable, int, error) {
	if regexp[index] == '\\' {
		return parse_regexp_class_atom_escape(regexp_token, regexp, index)
	}

	start, next_index, err := parse_regexp_class_atom_string(regexp_token, regexp, index)
	if err != nil {
		return nil, next_index, err
	}

	// Plain single character: nothing follows, or what follows is not '-'.
	if next_index >= len(regexp) || regexp[next_index] != '-' {
		return start, next_index, nil
	}

	// The '-' is the final byte of the pattern (e.g. "[a-"). There is no
	// upper bound to read, and reading one would index past the end of the
	// string, so leave the '-' to be parsed as a literal on the next call.
	if next_index+1 >= len(regexp) {
		return start, next_index, nil
	}

	to, end_index, err := parse_regexp_class_atom_string(regexp_token, regexp, next_index+1)
	if err != nil {
		return nil, end_index, err
	}

	// '-' directly before the closing ']' is a literal dash, not a range.
	if to == nil {
		return start, next_index, nil
	}

	return &AstRange{start, to}, end_index, nil
}

// parse_regexp_class_atom_escape handles a backslash escape inside a
// character class. index is the position of the backslash.
//
// Escape sequences are not implemented yet. Rather than panicking on a
// user-supplied pattern, the function reports a parse error so the failure
// surfaces through the normal error path.
func parse_regexp_class_atom_escape(regexp_token *Token, regexp string, index int) (AstListable, int, error) {
	if index+1 >= len(regexp) {
		// A lone trailing backslash has nothing to escape.
		return nil, index + 1, NewParseError(*regexp_token, "Unexpected end of regexp")
	}
	return nil, index, NewParseError(*regexp_token, "Escape sequences in regexp character classes are not supported yet")
}

// parse_regexp_class_atom_string reads a single literal character of a
// character class at index and returns it as an *AstString along with the
// index of the following byte.
//
// It returns a nil atom (and no error) when there is nothing to read: either
// index is at or past the end of the pattern, or the byte at index is the
// closing ']'. In both cases the returned index is the index passed in.
//
// NOTE(review): the atom is built from one byte via string(byte), which
// yields the UTF-8 encoding of the code point with that byte's value. For
// non-ASCII patterns this splits multi-byte characters and re-encodes each
// byte; confirm whether the matcher expects that.
func parse_regexp_class_atom_string(regexp_token *Token, regexp string, index int) (*AstString, int, error) {
	// Guard the index: the range parser may ask for an upper bound that sits
	// just past the end of an unterminated pattern.
	if index >= len(regexp) || regexp[index] == ']' {
		return nil, index, nil
	}
	return &AstString{false, string(regexp[index]), false}, index + 1, nil
}

func parse_regexp_quantifier(regexp_token *Token, regexp string, index int) (*AstLoop, int, error) {
if index >= len(regexp) {
return nil, index, nil
Expand Down
41 changes: 41 additions & 0 deletions libvore/vore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -474,3 +474,44 @@ func TestRegexp8(t *testing.T) {
{6, "123", None[string](), []TestVar{}},
})
}

// TestRegexp9 checks that the negated empty class "[^]" with a star
// quantifier matches the whole two-line input, newline included.
func TestRegexp9(t *testing.T) {
	const input = `1231231
2312`

	program, err := Compile("find all @/[^]*/")
	checkNoError(t, err)

	found := program.Run(input)
	matches(t, found, []TestMatch{
		{0, input, None[string](), []TestVar{}},
	})
}

// TestRegexp10 checks that a class listing individual characters, repeated
// with '*', matches the run of those characters inside the input.
func TestRegexp10(t *testing.T) {
	program, err := Compile("find all @/[abc]*/")
	checkNoError(t, err)

	found := program.Run(`123aabbcc986`)
	expected := []TestMatch{
		{3, `aabbcc`, None[string](), []TestVar{}},
	}
	matches(t, found, expected)
}

// TestRegexp11 checks a character range combined with a bounded quantifier:
// lowercase letters are matched in chunks of at most two characters.
func TestRegexp11(t *testing.T) {
	program, err := Compile("find all @/[a-z]{0,2}/")
	checkNoError(t, err)

	found := program.Run("IT WILL CATCH this AND it WILL GET me")
	expected := []TestMatch{
		{14, "th", None[string](), []TestVar{}},
		{16, "is", None[string](), []TestVar{}},
		{23, "it", None[string](), []TestVar{}},
		{35, "me", None[string](), []TestVar{}},
	}
	matches(t, found, expected)
}

// TestRegexp12 checks that a '-' directly before the closing ']' is treated
// as a literal dash rather than as a range operator.
func TestRegexp12(t *testing.T) {
	program, err := Compile("find all @/[a-]*/")
	checkNoError(t, err)

	found := program.Run(`123aa--a-ac986`)
	expected := []TestMatch{
		{3, `aa--a-a`, None[string](), []TestVar{}},
	}
	matches(t, found, expected)
}

0 comments on commit c6acc4f

Please sign in to comment.