From 0941c12a324fadd0e786479e6c2206f947d75bd1 Mon Sep 17 00:00:00 2001 From: Rob van der Leek <5324924+robvanderleek@users.noreply.github.com> Date: Tue, 2 Apr 2024 22:59:10 +0200 Subject: [PATCH] Finalizing GSM... --- codelimit/common/ScanTotals.py | 2 +- codelimit/common/gsm/Atom.py | 10 +++--- codelimit/common/gsm/Automata.py | 16 ++++----- codelimit/common/gsm/Concat.py | 6 ++-- codelimit/common/gsm/DFA.py | 14 ++++++++ codelimit/common/gsm/Expression.py | 30 +++++++++++----- codelimit/common/gsm/NFA.py | 14 ++++++++ codelimit/common/gsm/OneOrMore.py | 10 +++--- codelimit/common/gsm/Operator.py | 4 +-- codelimit/common/gsm/Optional.py | 10 +++--- codelimit/common/gsm/Pattern.py | 10 ++++-- codelimit/common/gsm/Predicate.py | 11 ++++++ codelimit/common/gsm/State.py | 2 +- codelimit/common/gsm/Union.py | 10 +++--- codelimit/common/gsm/ZeroOrMore.py | 10 +++--- codelimit/common/gsm/matcher.py | 28 ++++++--------- codelimit/common/gsm/utils.py | 34 +++++++++++++------ .../predicates/TokenPredicate.py | 5 +-- .../common/token_matching/predicates/Value.py | 8 +++-- pyproject.toml | 3 ++ tests/common/gsm/test_matcher.py | 2 +- 21 files changed, 153 insertions(+), 86 deletions(-) create mode 100644 codelimit/common/gsm/DFA.py create mode 100644 codelimit/common/gsm/NFA.py create mode 100644 codelimit/common/gsm/Predicate.py diff --git a/codelimit/common/ScanTotals.py b/codelimit/common/ScanTotals.py index 5ad123d..5eb72b2 100644 --- a/codelimit/common/ScanTotals.py +++ b/codelimit/common/ScanTotals.py @@ -3,7 +3,7 @@ class ScanTotals: - def __init__(self): + def __init__(self) -> None: self._languages_totals: dict[str, LanguageTotals] = {} def add(self, entry: SourceFileEntry): diff --git a/codelimit/common/gsm/Atom.py b/codelimit/common/gsm/Atom.py index f43f4d6..111b8b4 100644 --- a/codelimit/common/gsm/Atom.py +++ b/codelimit/common/gsm/Atom.py @@ -1,14 +1,16 @@ -from codelimit.common.gsm.Automata import Automata +from typing import Any + +from codelimit.common.gsm.NFA import NFA from codelimit.common.gsm.Operator import Operator from codelimit.common.gsm.State import State class Atom(Operator): - def __init__(self, item: str): + def __init__(self, item: Any): self.item = item - def apply(self, stack: list[Automata]): + def apply(self, stack: list[NFA]): start = State() accepting = State() start.transition.append((self.item, accepting)) - stack.append(Automata(start, accepting)) + stack.append(NFA(start, accepting)) diff --git a/codelimit/common/gsm/Automata.py b/codelimit/common/gsm/Automata.py index 136e492..8a0d7e4 100644 --- a/codelimit/common/gsm/Automata.py +++ b/codelimit/common/gsm/Automata.py @@ -1,16 +1,12 @@ +from abc import abstractmethod, ABC + from codelimit.common.gsm.State import State -class Automata: - def __init__(self, start: State, accepting: State | list[State]): +class Automata(ABC): + def __init__(self, start: State): self.start = start - self.accepting = accepting + @abstractmethod def is_accepting(self, state: State) -> bool: - if isinstance(self.accepting, list): - return state in self.accepting - else: - return state == self.accepting - - def __str__(self): - return f'Automata(start={self.start}, accepting={self.accepting})' \ No newline at end of file + pass diff --git a/codelimit/common/gsm/Concat.py b/codelimit/common/gsm/Concat.py index c3ff66d..c3a4bb7 100644 --- a/codelimit/common/gsm/Concat.py +++ b/codelimit/common/gsm/Concat.py @@ -1,13 +1,13 @@ -from codelimit.common.gsm.Automata import Automata +from codelimit.common.gsm.NFA import NFA from codelimit.common.gsm.Operator import Operator class Concat(Operator): - def apply(self, stack: list[Automata]): + def apply(self, stack: list[NFA]): if len(stack) < 2: return nfa1 = stack.pop() nfa2 = stack.pop() nfa2.accepting.assign(nfa1.start) - nfa = Automata(nfa2.start, nfa1.accepting) + nfa = NFA(nfa2.start, nfa1.accepting) stack.append(nfa) diff --git a/codelimit/common/gsm/DFA.py b/codelimit/common/gsm/DFA.py new file mode 100644 index 0000000..0f41b32 --- /dev/null +++ b/codelimit/common/gsm/DFA.py @@ -0,0 +1,14 @@ +from codelimit.common.gsm.Automata import Automata +from codelimit.common.gsm.State import State + + +class DFA(Automata): + def __init__(self, start: State, accepting: list[State]): + super().__init__(start) + self.accepting = accepting + + def is_accepting(self, state: State) -> bool: + return state in self.accepting + + def __str__(self): + return f'DFA(start={self.start}, accepting={self.accepting})' diff --git a/codelimit/common/gsm/Expression.py b/codelimit/common/gsm/Expression.py index 6f82b2a..0450b13 100644 --- a/codelimit/common/gsm/Expression.py +++ b/codelimit/common/gsm/Expression.py @@ -1,15 +1,27 @@ -from typing import Iterable +from typing import Iterable, TypeVar, TypeAlias from codelimit.common.gsm.Atom import Atom -from codelimit.common.gsm.Automata import Automata from codelimit.common.gsm.Concat import Concat +from codelimit.common.gsm.DFA import DFA +from codelimit.common.gsm.NFA import NFA from codelimit.common.gsm.Operator import Operator +from codelimit.common.gsm.Predicate import Predicate from codelimit.common.gsm.State import State +T = TypeVar('T') -def expression_to_nfa(expression: list[Operator | str]) -> Automata: - op_expression = [Atom(item) if isinstance(item, str) else item for item in expression] - nfa_stack = [] +Expression: TypeAlias = Operator | Predicate[T] | T | list[Operator | Predicate[T] | T] + + +def expression_to_nfa(expression: Expression[T]) -> NFA: + if isinstance(expression, list): + op_expression = [Atom(item) if not isinstance(item, Operator) or isinstance(item, Predicate) else + item for item in expression] + + else: + op_expression = [Atom(expression) if not isinstance(expression, Operator) or + isinstance(expression, Predicate) else expression] + nfa_stack: list[NFA] = [] for item in op_expression: item.apply(nfa_stack) Concat().apply(nfa_stack) @@ -20,7 +32,7 @@ def expression_to_nfa(expression: list[Operator | str]) -> Automata: def epsilon_closure(states: State | Iterable[State]) -> set[State]: result = set() if isinstance(states, State): - states: set[State] = {states} + states = {states} for state in states: result.add(state) for s in state.epsilon_transitions: @@ -49,10 +61,10 @@ def state_set_id(states: set[State]) -> str: return ", ".join([str(id) for id in sorted([state.id for state in states])]) -def nfa_to_dfa(nfa: Automata) -> Automata: +def nfa_to_dfa(nfa: NFA) -> DFA: start = State() stack = [(start, epsilon_closure(nfa.start))] - states = {} + states: dict[str, State] = {} accepting_states = [] marked_states = set() while stack: @@ -74,4 +86,4 @@ def nfa_to_dfa(nfa: Automata) -> Automata: states[state_set_id(new_states)] = new_state state.transition.append((atom, new_state)) stack.append((new_state, new_states)) - return Automata(start, accepting_states) + return DFA(start, accepting_states) diff --git a/codelimit/common/gsm/NFA.py b/codelimit/common/gsm/NFA.py new file mode 100644 index 0000000..96dd4a1 --- /dev/null +++ b/codelimit/common/gsm/NFA.py @@ -0,0 +1,14 @@ +from codelimit.common.gsm.Automata import Automata +from codelimit.common.gsm.State import State + + +class NFA(Automata): + def __init__(self, start: State, accepting: State): + super().__init__(start) + self.accepting = accepting + + def is_accepting(self, state: State) -> bool: + return state == self.accepting + + def __str__(self): + return f'NFA(start={self.start}, accepting={self.accepting})' diff --git a/codelimit/common/gsm/OneOrMore.py b/codelimit/common/gsm/OneOrMore.py index 6f6cdb5..2ac92c3 100644 --- a/codelimit/common/gsm/OneOrMore.py +++ b/codelimit/common/gsm/OneOrMore.py @@ -1,17 +1,17 @@ -from codelimit.common.gsm.Expression import expression_to_nfa -from codelimit.common.gsm.Automata import Automata +from codelimit.common.gsm.Expression import expression_to_nfa, Expression +from codelimit.common.gsm.NFA import NFA from codelimit.common.gsm.Operator import Operator from codelimit.common.gsm.State import State class OneOrMore(Operator): - def __init__(self, expression: Operator | str | list[Operator | str]): + def __init__(self, expression: Expression): self.expression = expression if isinstance(expression, list) else [expression] - def apply(self, stack: list[Automata]): + def apply(self, stack: list[NFA]): start = State() nfa = expression_to_nfa(self.expression) accepting = State() start.epsilon_transitions = [nfa.start] nfa.accepting.epsilon_transitions = [nfa.start, accepting] - stack.append(Automata(start, accepting)) + stack.append(NFA(start, accepting)) diff --git a/codelimit/common/gsm/Operator.py b/codelimit/common/gsm/Operator.py index dc3385c..082daf6 100644 --- a/codelimit/common/gsm/Operator.py +++ b/codelimit/common/gsm/Operator.py @@ -1,9 +1,9 @@ from abc import ABC, abstractmethod -from codelimit.common.gsm.Automata import Automata +from codelimit.common.gsm.NFA import NFA class Operator(ABC): @abstractmethod - def apply(self, stack: list[Automata]): + def apply(self, stack: list[NFA]): pass diff --git a/codelimit/common/gsm/Optional.py b/codelimit/common/gsm/Optional.py index 109d548..955159b 100644 --- a/codelimit/common/gsm/Optional.py +++ b/codelimit/common/gsm/Optional.py @@ -1,17 +1,17 @@ -from codelimit.common.gsm.Expression import expression_to_nfa -from codelimit.common.gsm.Automata import Automata +from codelimit.common.gsm.Expression import expression_to_nfa, Expression +from codelimit.common.gsm.NFA import NFA from codelimit.common.gsm.Operator import Operator from codelimit.common.gsm.State import State class Optional(Operator): - def __init__(self, expression: Operator | str | list[Operator | str]): + def __init__(self, expression: Expression): self.expression = expression if isinstance(expression, list) else [expression] - def apply(self, stack: list[Automata]): + def apply(self, stack: list[NFA]): start = State() nfa = expression_to_nfa(self.expression) accepting = State() start.epsilon_transitions = [nfa.start, accepting] nfa.accepting.epsilon_transitions = [accepting] - stack.append(Automata(start, accepting)) + stack.append(NFA(start, accepting)) diff --git a/codelimit/common/gsm/Pattern.py b/codelimit/common/gsm/Pattern.py index 631d5f7..fd4aa9a 100644 --- a/codelimit/common/gsm/Pattern.py +++ b/codelimit/common/gsm/Pattern.py @@ -1,8 +1,12 @@ -from codelimit.common.gsm.State import State +from codelimit.common.gsm.Automata import Automata class Pattern: - def __init__(self, start: int, state: State): + def __init__(self, start: int, automata: Automata): self.start = start - self.state = state + self.automata = automata + self.state = automata.start self.tokens: list = [] + + def is_accepting(self): + return self.automata.is_accepting(self.state) diff --git a/codelimit/common/gsm/Predicate.py b/codelimit/common/gsm/Predicate.py new file mode 100644 index 0000000..64b75f6 --- /dev/null +++ b/codelimit/common/gsm/Predicate.py @@ -0,0 +1,11 @@ +from abc import ABC, abstractmethod +from typing import Generic, TypeVar + +T = TypeVar('T') + + +class Predicate(ABC, Generic[T]): + + @abstractmethod + def accept(self, item: T) -> bool: + pass diff --git a/codelimit/common/gsm/State.py b/codelimit/common/gsm/State.py index 66fcf1e..65b86fb 100644 --- a/codelimit/common/gsm/State.py +++ b/codelimit/common/gsm/State.py @@ -4,7 +4,7 @@ class State: _id = 1 - def __init__(self): + def __init__(self) -> None: self.id = State._id State._id += 1 self.transition: list[tuple[str, State]] = [] diff --git a/codelimit/common/gsm/Union.py b/codelimit/common/gsm/Union.py index 7c88484..0419840 100644 --- a/codelimit/common/gsm/Union.py +++ b/codelimit/common/gsm/Union.py @@ -1,15 +1,15 @@ -from codelimit.common.gsm.Expression import expression_to_nfa -from codelimit.common.gsm.Automata import Automata +from codelimit.common.gsm.Expression import expression_to_nfa, Expression +from codelimit.common.gsm.NFA import NFA from codelimit.common.gsm.Operator import Operator from codelimit.common.gsm.State import State class Union(Operator): - def __init__(self, left: Operator | str | list[Operator | str], right: Operator | str | list[Operator | str]): + def __init__(self, left: Expression, right: Expression): self.left = left if isinstance(left, list) else [left] self.right = right if isinstance(right, list) else [right] - def apply(self, stack: list[Automata]): + def apply(self, stack: list[NFA]): start = State() nfa1 = expression_to_nfa(self.left) nfa2 = expression_to_nfa(self.right) @@ -17,4 +17,4 @@ def apply(self, stack: list[Automata]): accepting = State() nfa1.accepting.epsilon_transitions = [accepting] nfa2.accepting.epsilon_transitions = [accepting] - stack.append(Automata(start, accepting)) + stack.append(NFA(start, accepting)) diff --git a/codelimit/common/gsm/ZeroOrMore.py b/codelimit/common/gsm/ZeroOrMore.py index 659a33e..859b2c6 100644 --- a/codelimit/common/gsm/ZeroOrMore.py +++ b/codelimit/common/gsm/ZeroOrMore.py @@ -1,17 +1,17 @@ -from codelimit.common.gsm.Expression import expression_to_nfa -from codelimit.common.gsm.Automata import Automata +from codelimit.common.gsm.Expression import expression_to_nfa, Expression +from codelimit.common.gsm.NFA import NFA from codelimit.common.gsm.Operator import Operator from codelimit.common.gsm.State import State class ZeroOrMore(Operator): - def __init__(self, expression: Operator | str | list[Operator | str]): + def __init__(self, expression: Expression): self.expression = expression if isinstance(expression, list) else [expression] - def apply(self, stack: list[Automata]): + def apply(self, stack: list[NFA]): start = State() nfa = expression_to_nfa(self.expression) accepting = State() start.epsilon_transitions = [nfa.start, accepting] nfa.accepting.epsilon_transitions = [nfa.start, accepting] - stack.append(Automata(start, accepting)) + stack.append(NFA(start, accepting)) diff --git a/codelimit/common/gsm/matcher.py b/codelimit/common/gsm/matcher.py index d0fd426..d1546ea 100644 --- a/codelimit/common/gsm/matcher.py +++ b/codelimit/common/gsm/matcher.py @@ -1,17 +1,19 @@ -import subprocess -import tempfile +import copy +from typing import TypeVar -from codelimit.common.gsm.Automata import Automata from codelimit.common.gsm.Expression import expression_to_nfa, epsilon_closure, nfa_to_dfa from codelimit.common.gsm.Operator import Operator from codelimit.common.gsm.Pattern import Pattern -from codelimit.common.gsm.utils import to_dot +from codelimit.common.gsm.Predicate import Predicate +from codelimit.common.gsm.utils import render_automata +T = TypeVar('T') -def match(expression: list[Operator | str], text: list) -> Pattern | None: + +def match(expression: Operator | Predicate[T] | T | list[Operator | Predicate[T] | T], text: list) -> Pattern | None: nfa = expression_to_nfa(expression) dfa = nfa_to_dfa(nfa) - pattern = Pattern(dfa.start) + pattern = Pattern(0, dfa) for char in text: next_state = None for transition in pattern.state.transition: @@ -35,7 +37,8 @@ def find_all(expression: list[Operator | str], text: list) -> list[Pattern]: active_patterns = [] last_match_idx = -1 for idx, char in enumerate(text): - active_patterns.append(Pattern(idx, dfa.start)) + dfa_copy = copy.deepcopy(dfa) + active_patterns.append(Pattern(idx, dfa_copy)) next_state_patterns = [] for pattern in active_patterns: if pattern.start <= last_match_idx: @@ -46,7 +49,7 @@ def find_all(expression: list[Operator | str], text: list) -> list[Pattern]: pattern.state = transition[1] next_state_patterns.append(pattern) else: - if pattern.state in dfa.accepting: + if pattern.is_accepting(): matches.append(pattern) last_match_idx = idx active_patterns = next_state_patterns @@ -75,12 +78,3 @@ def render_nfa(expression: list[Operator | str]): def render_dfa(expression: list[Operator | str]): render_automata(nfa_to_dfa(expression_to_nfa(expression))) - - -def render_automata(automata: Automata): - dot = to_dot(automata) - with tempfile.NamedTemporaryFile(mode='w') as f: - f.write(dot) - f.flush() - subprocess.run(['dot', '-Tpdf', f'-o{f.name}.pdf', f.name]) - subprocess.run(['open', f'{f.name}.pdf']) diff --git a/codelimit/common/gsm/utils.py b/codelimit/common/gsm/utils.py index cdfb5e4..edc9afd 100644 --- a/codelimit/common/gsm/utils.py +++ b/codelimit/common/gsm/utils.py @@ -1,26 +1,29 @@ +import subprocess +import tempfile + from codelimit.common.gsm.Automata import Automata from codelimit.common.gsm.State import State -def state_to_dot(nfa: Automata, state: State, dot="", visited=None): +def state_to_dot(automata: Automata, state: State, dot="", visited=None): if visited is None: visited = [] if state in visited: return dot visited.append(state) - if nfa.is_accepting(state): + if automata.is_accepting(state): dot += f'{state.id} [label="{state.id}" peripheries=2]\n' else: dot += f'{state.id} [label="{state.id}"]\n' for transition in state.transition: target = transition[1] - dot = state_to_dot(nfa, target, dot, visited) + dot = state_to_dot(automata, target, dot, visited) for target in state.epsilon_transitions: - dot = state_to_dot(nfa, target, dot, visited) + dot = state_to_dot(automata, target, dot, visited) return dot -def state_transitions_to_dot(nfa: Automata, state: State, dot="", visited=None): +def state_transitions_to_dot(automata: Automata, state: State, dot="", visited=None): if visited is None: visited = [] if state in visited: @@ -30,19 +33,28 @@ def state_transitions_to_dot(nfa: Automata, state: State, dot="", visited=None): char = transition[0] target = transition[1] dot += f'{state.id} -> {target.id} [label="{char}"]\n' - dot = state_transitions_to_dot(nfa, target, dot, visited) + dot = state_transitions_to_dot(automata, target, dot, visited) for target in state.epsilon_transitions: dot += f'{state.id} -> {target.id} [label="ε"]\n' - dot = state_transitions_to_dot(nfa, target, dot, visited) + dot = state_transitions_to_dot(automata, target, dot, visited) return dot -def to_dot(nfa: Automata): +def render_automata(automata: Automata): + dot = to_dot(automata) + with tempfile.NamedTemporaryFile(mode='w') as f: + f.write(dot) + f.flush() + subprocess.run(['dot', '-Tpdf', f'-o{f.name}.pdf', f.name]) + subprocess.run(['open', f'{f.name}.pdf']) + + +def to_dot(automata: Automata): result = "digraph {\n" result += 'rankdir="LR"\n' result += 'start [label = "start", style = "invis"]\n' - result += state_to_dot(nfa, nfa.start) - result += f'start -> {nfa.start.id} [label = "start"]\n' - result += state_transitions_to_dot(nfa, nfa.start) + result += state_to_dot(automata, automata.start) + result += f'start -> {automata.start.id} [label = "start"]\n' + result += state_transitions_to_dot(automata, automata.start) result += "}" return result diff --git a/codelimit/common/token_matching/predicates/TokenPredicate.py b/codelimit/common/token_matching/predicates/TokenPredicate.py index 69f333b..1d7449f 100644 --- a/codelimit/common/token_matching/predicates/TokenPredicate.py +++ b/codelimit/common/token_matching/predicates/TokenPredicate.py @@ -1,9 +1,10 @@ -from abc import ABC, abstractmethod +from abc import abstractmethod from codelimit.common.Token import Token +from codelimit.common.gsm.Predicate import Predicate -class TokenPredicate(ABC): +class TokenPredicate(Predicate[Token]): def __init__(self): self.satisfied = False diff --git a/codelimit/common/token_matching/predicates/Value.py b/codelimit/common/token_matching/predicates/Value.py index 8f09be7..93c02d6 100644 --- a/codelimit/common/token_matching/predicates/Value.py +++ b/codelimit/common/token_matching/predicates/Value.py @@ -1,14 +1,18 @@ +from typing import TypeVar + from codelimit.common.Token import Token from codelimit.common.token_matching.predicates.TokenPredicate import TokenPredicate +T = TypeVar('T') + class Value(TokenPredicate): - def __init__(self, value: str): + def __init__(self, value: T): super().__init__() self.value = value def accept(self, token: Token) -> bool: - if token.value == self.value: + if token.value == str(self.value): self.satisfied = True return True return False diff --git a/pyproject.toml b/pyproject.toml index a0e9c46..24caf53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,9 @@ addopts = "--tb=short" help = "Create a binary executable using pyinstaller" cmd = "pyinstaller --workpath .build --specpath dist -n codelimit codelimit/__main__.py" +[tool.mypy] +ignore_missing_imports = true + [tool.semantic_release] branch = "main" version_toml = [ "pyproject.toml:tool.poetry.version" ] diff --git a/tests/common/gsm/test_matcher.py b/tests/common/gsm/test_matcher.py index 504e17f..19ae203 100644 --- a/tests/common/gsm/test_matcher.py +++ b/tests/common/gsm/test_matcher.py @@ -33,7 +33,7 @@ def test_to_string(): expr = ['a', 'b'] nfa = expression_to_nfa(expr) - assert str(nfa) == 'Automata(start=State(1), accepting=State(4))' + assert str(nfa) == 'NFA(start=State(1), accepting=State(4))' def test_to_dot():