From 0c2daf4075eefd48fa0cde2ff8937328cc5760d8 Mon Sep 17 00:00:00 2001 From: Mansur Date: Thu, 28 Sep 2023 01:35:55 -0600 Subject: [PATCH] minor updates --- nfa.go | 56 +++++++--------- parse.go | 193 ++++++++++++++++++++++++++++--------------------------- util.go | 2 + 3 files changed, 125 insertions(+), 126 deletions(-) diff --git a/nfa.go b/nfa.go index 32c791e..d97f93a 100644 --- a/nfa.go +++ b/nfa.go @@ -33,12 +33,12 @@ const ( newline = 10 ) -func toNfa(memory *parsingContext) (*State, *RegexError) { +func toNfa(parseCtx *parsingContext) (*State, *RegexError) { startFrom := 0 - endAt := len(memory.tokens) - 1 + endAt := len(parseCtx.tokens) - 1 - token := memory.tokens[startFrom] - startState, endState, err := tokenToNfa(token, memory, &State{ + token := parseCtx.tokens[startFrom] + startState, endState, err := tokenToNfa(token, parseCtx, &State{ transitions: map[uint8][]*State{}, }) @@ -47,7 +47,7 @@ func toNfa(memory *parsingContext) (*State, *RegexError) { } for i := startFrom + 1; i <= endAt; i++ { - _, endNext, err := tokenToNfa(memory.tokens[i], memory, endState) + _, endNext, err := tokenToNfa(parseCtx.tokens[i], parseCtx, endState) if err != nil { return nil, err } @@ -83,7 +83,7 @@ func toNfa(memory *parsingContext) (*State, *RegexError) { return start, nil } -func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*State, *State, *RegexError) { +func tokenToNfa(token regexToken, parseCtx *parsingContext, startFrom *State) (*State, *State, *RegexError) { switch token.tokenType { case literal: value := token.value.(uint8) @@ -93,22 +93,20 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St startFrom.transitions[value] = []*State{to} return startFrom, to, nil case quantifier: - return handleQuantifierToToken(token, memory, startFrom) + return handleQuantifierToToken(token, parseCtx, startFrom) case wildcard: to := &State{ transitions: map[uint8][]*State{}, } - startFrom.transitions[anyChar] = []*State{to} - return startFrom, to, nil case or: values := token.value.([]regexToken) - _, end1, err := tokenToNfa(values[0], memory, startFrom) + _, end1, err := tokenToNfa(values[0], parseCtx, startFrom) if err != nil { return nil, nil, err } - _, end2, err := tokenToNfa(values[1], memory, startFrom) + _, end2, err := tokenToNfa(values[1], parseCtx, startFrom) if err != nil { return nil, nil, err } @@ -125,7 +123,7 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St v := token.value.(groupTokenPayload) // concatenate all the elements in the group - start, end, err := tokenToNfa(v.tokens[0], memory, &State{ + start, end, err := tokenToNfa(v.tokens[0], parseCtx, &State{ transitions: map[uint8][]*State{}, }) @@ -134,7 +132,7 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St } for i := 1; i < len(v.tokens); i++ { - _, endNext, err := tokenToNfa(v.tokens[i], memory, end) + _, endNext, err := tokenToNfa(v.tokens[i], parseCtx, end) if err != nil { return nil, nil, err } @@ -142,14 +140,14 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St } // concatenation ends - groupNameNumeric := fmt.Sprintf("%d", memory.nextGroup()) + groupNameNumeric := fmt.Sprintf("%d", parseCtx.nextGroup()) groupNameUserSet := v.name groupNames := []string{groupNameNumeric} - memory.capturedGroups[groupNameNumeric] = true + parseCtx.capturedGroups[groupNameNumeric] = true if groupNameUserSet != "" { groupNames = append(groupNames, groupNameUserSet) - memory.capturedGroups[groupNameUserSet] = true + parseCtx.capturedGroups[groupNameUserSet] = true } if startFrom.groups != nil { @@ -186,11 +184,11 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St transitions: map[uint8][]*State{}, } - startFrom.transitions[epsilonChar] = append(startFrom.transitions[epsilonChar], end) + startFrom.transitions[epsilonChar] = append(startFrom.transitions[epsilonChar], startFrom) return startFrom, end, nil } - start, end, err := tokenToNfa(values[0], memory, &State{ + start, end, err := tokenToNfa(values[0], parseCtx, &State{ transitions: map[uint8][]*State{}, }) @@ -199,7 +197,7 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St } for i := 1; i < len(values); i++ { - _, endNext, err := tokenToNfa(values[i], memory, end) + _, endNext, err := tokenToNfa(values[i], parseCtx, end) if err != nil { return nil, nil, err @@ -211,21 +209,17 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St startFrom.transitions[epsilonChar] = append(startFrom.transitions[epsilonChar], start) return startFrom, end, nil case bracket: - constructTokens := token.value.([]regexToken) - to := &State{ transitions: map[uint8][]*State{}, } - for _, construct := range constructTokens { - ch := construct.value.(uint8) + constructTokens := token.value.(map[uint8]bool) + for ch := range constructTokens { startFrom.transitions[ch] = []*State{to} } return startFrom, to, nil case bracketNot: - constructTokens := token.value.([]regexToken) - to := &State{ transitions: map[uint8][]*State{}, } @@ -234,8 +228,8 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St transitions: map[uint8][]*State{}, } - for _, construct := range constructTokens { - ch := construct.value.(uint8) + constructTokens := token.value.(map[uint8]bool) + for ch := range constructTokens { startFrom.transitions[ch] = []*State{deadEnd} } startFrom.transitions[anyChar] = []*State{to} @@ -253,7 +247,7 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St return startFrom, startFrom, nil case backReference: groupName := token.value.(string) - if _, ok := memory.capturedGroups[groupName]; !ok { + if _, ok := parseCtx.capturedGroups[groupName]; !ok { return nil, nil, &RegexError{ Code: CompilationError, Message: fmt.Sprintf("Group (%s) does not exist", groupName), @@ -277,7 +271,7 @@ func tokenToNfa(token regexToken, memory *parsingContext, startFrom *State) (*St } } -func handleQuantifierToToken(token regexToken, memory *parsingContext, startFrom *State) (*State, *State, *RegexError) { +func handleQuantifierToToken(token regexToken, parseCtx *parsingContext, startFrom *State) (*State, *State, *RegexError) { payload := token.value.(quantifierPayload) // the minimum amount of time the NFA needs to repeat min := payload.min @@ -310,7 +304,7 @@ func handleQuantifierToToken(token regexToken, memory *parsingContext, startFrom } else { value = token.value.([]regexToken)[0] } - previousStart, previousEnd, err := tokenToNfa(value, memory, &State{ + previousStart, previousEnd, err := tokenToNfa(value, parseCtx, &State{ transitions: map[uint8][]*State{}, }) @@ -323,7 +317,7 @@ func handleQuantifierToToken(token regexToken, memory *parsingContext, startFrom // starting from 2, because the one above is the first one for i := 2; i <= total; i++ { // the same NFA needs to be generated 'total' times - start, end, err := tokenToNfa(value, memory, &State{ + start, end, err := tokenToNfa(value, parseCtx, &State{ transitions: map[uint8][]*State{}, }) diff --git a/parse.go b/parse.go index 033e433..8b5864b 100644 --- a/parse.go +++ b/parse.go @@ -151,27 +151,28 @@ func isQuantifier(ch uint8) bool { return ok } -func parseBracket(regexString string, memory *parsingContext) *RegexError { +func parseBracket(regexString string, context *parsingContext) *RegexError { var pieces []string var tokenType regexTokenType - if regexString[memory.loc()] == '^' { + // do we need to negate the bracket? + if regexString[context.loc()] == '^' { tokenType = bracketNot - memory.adv() + context.adv() } else { tokenType = bracket } - for memory.loc() < len(regexString) && regexString[memory.loc()] != ']' { - ch := regexString[memory.loc()] + for context.loc() < len(regexString) && regexString[context.loc()] != ']' { + ch := regexString[context.loc()] - if ch == '-' { - nextChar := regexString[memory.loc()+1] // TODO: this might fail if we are at the end of the string + if ch == '-' && context.loc()+1 < len(regexString) { + nextChar := regexString[context.loc()+1] // if - is the first character OR is the last character, it's a literal if len(pieces) == 0 || nextChar == ']' { pieces = append(pieces, fmt.Sprintf("%c", ch)) } else { - memory.adv() // to process the nextChar's position + context.adv() // to process the nextChar's position piece := pieces[len(pieces)-1] if len(piece) == 1 { prevChar := piece[0] @@ -181,15 +182,15 @@ func parseBracket(regexString string, memory *parsingContext) *RegexError { return &RegexError{ Code: SyntaxError, Message: fmt.Sprintf("'%c-%c' range is invalid", prevChar, nextChar), - Pos: memory.loc(), + Pos: context.loc(), } } } else { pieces = append(pieces, fmt.Sprintf("%c", ch)) } } - } else if ch == '\\' { - nextChar := regexString[memory.adv()] // TODO: this might fail if we are at the end of the string + } else if ch == '\\' && context.loc()+1 < len(regexString) { + nextChar := regexString[context.adv()] // TODO: some characters are special: \a does not just mean a, it means alarm ascii char etc. // TODO: maybe in future, I'll implement that as well // TODO: for now, all the escaped characters will be treated as literals @@ -197,14 +198,22 @@ func parseBracket(regexString string, memory *parsingContext) *RegexError { } else { pieces = append(pieces, fmt.Sprintf("%c", ch)) } - memory.adv() + context.adv() + } + + if regexString[context.loc()] != ']' { + return &RegexError{ + Code: SyntaxError, + Message: "Bracket is not closed properly", + Pos: context.loc(), + } } if len(pieces) == 0 { return &RegexError{ Code: SyntaxError, Message: "Bracket should not be empty", - Pos: memory.loc(), + Pos: context.loc(), } } @@ -215,26 +224,18 @@ func parseBracket(regexString string, memory *parsingContext) *RegexError { } } - var finalTokens []regexToken - for ch := range uniqueCharacterPieces { - finalTokens = append(finalTokens, regexToken{ - tokenType: literal, - value: ch, - }) - } - token := regexToken{ tokenType: tokenType, - value: finalTokens, + value: uniqueCharacterPieces, } - memory.tokens = append(memory.tokens, token) + context.tokens = append(context.tokens, token) return nil } -func parseGroup(regexString string, memory *parsingContext) *RegexError { +func parseGroup(regexString string, context *parsingContext) *RegexError { groupContext := parsingContext{ - pos: memory.loc(), + pos: context.loc(), tokens: []regexToken{}, } @@ -279,14 +280,14 @@ func parseGroup(regexString string, memory *parsingContext) *RegexError { name: groupName, }, } - memory.push(token) - memory.advTo(groupContext.loc()) + context.push(token) + context.advTo(groupContext.loc()) return nil } -func parseGroupUncaptured(regexString string, memory *parsingContext) *RegexError { +func parseGroupUncaptured(regexString string, context *parsingContext) *RegexError { groupContext := parsingContext{ - pos: memory.loc(), + pos: context.loc(), tokens: []regexToken{}, } @@ -298,61 +299,57 @@ func parseGroupUncaptured(regexString string, memory *parsingContext) *RegexErro groupContext.adv() } + if regexString[groupContext.loc()] != ')' { + return &RegexError{ + Code: SyntaxError, + Message: "Group has not been properly closed", + Pos: groupContext.loc(), + } + } + token := regexToken{ tokenType: groupUncaptured, value: groupContext.tokens, } - memory.push(token) + context.push(token) if groupContext.loc() >= len(regexString) { - memory.advTo(groupContext.loc()) + context.advTo(groupContext.loc()) } else if regexString[groupContext.loc()] == ')' { - memory.advTo(groupContext.loc() - 1) // advance but do not consume the closing parenthesis + context.advTo(groupContext.loc() - 1) // advance but do not consume the closing parenthesis } return nil } -func parseQuantifier(ch uint8, memory *parsingContext) { - bounds := quantifiersWithBounds[ch] - token := regexToken{ - tokenType: quantifier, - value: quantifierPayload{ - min: bounds[0], - max: bounds[1], - value: memory.removeLast(1), - }, - } - memory.push(token) -} - -func parseLiteral(ch uint8, memory *parsingContext) { - token := regexToken{ - tokenType: literal, - value: ch, - } - memory.push(token) -} - -func processChar(regexString string, memory *parsingContext, ch uint8) *RegexError { +func processChar(regexString string, context *parsingContext, ch uint8) *RegexError { if ch == '(' { - memory.adv() - if err := parseGroup(regexString, memory); err != nil { + context.adv() + if err := parseGroup(regexString, context); err != nil { return err } } else if ch == '[' { - memory.adv() - if err := parseBracket(regexString, memory); err != nil { + context.adv() + if err := parseBracket(regexString, context); err != nil { return err } } else if isQuantifier(ch) { - parseQuantifier(ch, memory) + bounds := quantifiersWithBounds[ch] + token := regexToken{ + tokenType: quantifier, + value: quantifierPayload{ + min: bounds[0], + max: bounds[1], + value: context.removeLast(1), + }, + } + context.push(token) } else if ch == '{' { - if err := parseBoundedQuantifier(regexString, memory); err != nil { + if err := parseBoundedQuantifier(regexString, context); err != nil { return err } } else if ch == '\\' { // escaped backslash - if err := parseBackslash(regexString, memory); err != nil { + if err := parseBackslash(regexString, context); err != nil { return err } } else if isWildcard(ch) { @@ -360,28 +357,32 @@ func processChar(regexString string, memory *parsingContext, ch uint8) *RegexErr tokenType: wildcard, value: ch, } - memory.push(token) + context.push(token) } else if isLiteral(ch) { - parseLiteral(ch, memory) + token := regexToken{ + tokenType: literal, + value: ch, + } + context.push(token) } else if ch == '|' { // everything to the left of the pipe in this specific "parsingContext" // is considered as the left side of the OR left := regexToken{ tokenType: groupUncaptured, - value: memory.removeLast(len(memory.tokens)), + value: context.removeLast(len(context.tokens)), } - memory.adv() // to not get stuck in the pipe char - if err := parseGroupUncaptured(regexString, memory); err != nil { + context.adv() // to not get stuck in the pipe char + if err := parseGroupUncaptured(regexString, context); err != nil { return err } - right := memory.removeLast(1)[0] // TODO: better error handling? + right := context.removeLast(1)[0] // TODO: better error handling? token := regexToken{ tokenType: or, value: []regexToken{left, right}, } - memory.push(token) + context.push(token) } else if ch == '^' || ch == '$' { // anchors var tokenType = regexTokenType(textBeginning) @@ -393,18 +394,18 @@ func processChar(regexString string, memory *parsingContext, ch uint8) *RegexErr tokenType: tokenType, value: ch, } - memory.push(token) + context.push(token) } return nil } -func parseBoundedQuantifier(regexString string, memory *parsingContext) *RegexError { - startPos := memory.adv() - var endPos = memory.loc() +func parseBoundedQuantifier(regexString string, context *parsingContext) *RegexError { + startPos := context.adv() + var endPos = context.loc() for regexString[endPos] != '}' { endPos++ } - memory.advTo(endPos) + context.advTo(endPos) expr := regexString[startPos:endPos] pieces := strings.Split(expr, ",") @@ -457,42 +458,42 @@ func parseBoundedQuantifier(regexString string, memory *parsingContext) *RegexEr value: quantifierPayload{ min: start, max: end, - value: memory.removeLast(1), + value: context.removeLast(1), }, } - memory.push(token) + context.push(token) return nil } -func parseBackslash(regexString string, memory *parsingContext) *RegexError { - nextChar := regexString[memory.loc()+1] +func parseBackslash(regexString string, context *parsingContext) *RegexError { + nextChar := regexString[context.loc()+1] if isNumeric(nextChar) { // cares about the next single digit token := regexToken{ tokenType: backReference, value: fmt.Sprintf("%c", nextChar), } - memory.push(token) - memory.adv() + context.push(token) + context.adv() } else if nextChar == 'k' { // \k reference - memory.adv() - if regexString[memory.adv()] == '<' { + context.adv() + if regexString[context.adv()] == '<' { groupName := "" - for regexString[memory.adv()] != '>' { - nextChar = regexString[memory.loc()] + for regexString[context.adv()] != '>' { + nextChar = regexString[context.loc()] groupName += fmt.Sprintf("%c", nextChar) } token := regexToken{ tokenType: backReference, value: groupName, } - memory.push(token) - memory.adv() + context.push(token) + context.adv() } else { return &RegexError{ Code: SyntaxError, Message: "Invalid backreference syntax", - Pos: memory.loc(), + Pos: context.loc(), } } } else if _, canBeEscaped := mustBeEscapedCharacters[nextChar]; canBeEscaped { @@ -500,9 +501,11 @@ func parseBackslash(regexString string, memory *parsingContext) *RegexError { tokenType: literal, value: nextChar, } - memory.push(token) - memory.adv() + context.push(token) + context.adv() } else { + // we're treating newline and tab as special characters + // and not the rest if nextChar == 'n' { nextChar = '\n' } else if nextChar == 't' { @@ -512,20 +515,20 @@ func parseBackslash(regexString string, memory *parsingContext) *RegexError { tokenType: literal, value: nextChar, } - memory.push(token) - memory.adv() + context.push(token) + context.adv() } return nil } -func parse(regexString string, memory *parsingContext) *RegexError { - for memory.loc() < len(regexString) { - ch := regexString[memory.loc()] - if err := processChar(regexString, memory, ch); err != nil { +func parse(regexString string, context *parsingContext) *RegexError { + for context.loc() < len(regexString) { + ch := regexString[context.loc()] + if err := processChar(regexString, context, ch); err != nil { return err } - memory.adv() + context.adv() } return nil } diff --git a/util.go b/util.go index 066863c..50b89d0 100644 --- a/util.go +++ b/util.go @@ -74,6 +74,8 @@ func dot(s *State, processedStateForDot map[string]bool) { label = "any" } else if char == epsilonChar { label = "ε" + } else if char == '\\' { + label = "backslash" } else { label = fmt.Sprintf("\"%c\"", char) }