Add dfa2regex.py

za3k · Oct 16, 2020 · 67189b1 · 67189b1
1 parent f3c67f9
commit 67189b1
Show file tree

Hide file tree

Showing 2 changed files with 181 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -74,6 +74,10 @@ deshebang
 ---
 Usage: `#!/usr/bin/env deshebang` at the top of a file, makes that file self-printing.
 
+dfa2regex.py
+---
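+Converts a DFA into an equivalent regular expression. The DFA is hard-coded in the script, so it needs to be hand-edited to convert a different automaton.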
+
 dzen-clock
 ---
 Make a small clock at the bottom of the screen

diff --git a/dfa2regex.py b/dfa2regex.py
@@ -0,0 +1,177 @@
+import itertools, functools, copy, re
+
+# Example DFA: accepts binary numerals (most significant bit first) whose value is divisible by 7.
+# State str(x) means "the bits read so far have value x mod 7"; reading bit b moves to (2*x + b) mod 7.
+DFA = {str(x): { "0": str((x*2)%7), "1": str((x*2+1)%7)  } for x in range(7) }
+DFA_START = "0"
+DFA_ACCEPT = {"0"}
+
+# Make sure there's a unique start and end state with no transitions in or out
+# "start" receives a shallow copy of the original start state's outgoing transitions.
+# NOTE: the copy is taken before the "" edges are added below, so "start" itself
+# gets no "" edge to "end" even though DFA_START is an accepting state.
+DFA["start"] = copy.copy(DFA[DFA_START])
+DFA["end"] = {}
+# The empty-string label "" marks the edge from each accepting state to "end".
+for state in DFA_ACCEPT:
+    DFA[state][""] = "end"
+# dfa is the working copy that gets rewritten into a regex; DFA is kept as-is
+# so the original automaton can still be simulated for comparison.
+dfa = copy.deepcopy(DFA)
+
+def atom(x):
+    """Wrap the literal text x as a regex fragment.
+
+    Fragments are (precedence, text) pairs. Precedence 0 is the tightest
+    level: parenthesize() returns such fragments unchanged.
+    """
+    precedence = 0
+    return (precedence, x)
+def parenthesize(r1):
+    """Return r1 as a precedence-0 fragment.
+
+    A fragment that is already precedence 0 is returned unchanged; anything
+    else has its text wrapped in parentheses.
+    """
+    level, text = r1[0], r1[1]
+    if level != 0:
+        return (0, "({})".format(text))
+    return r1
+def kleene_star(r1):
+    """Return the fragment matching zero or more repetitions of r1.
+
+    r1 is a (precedence, text) fragment. The result has precedence 1, except
+    in the two cases below where r1 is returned unchanged because starring it
+    would not change the language it matches.
+    """
+    if r1[1] == "":
+        # Repeating the empty string still matches only the empty string,
+        # and a bare "*" is not a valid regex.
+        return r1
+    if r1[0] == 1:
+        # Precedence 1 is only produced by this function, so r1 is already
+        # starred. Appending another "*" would give "x**", which the re
+        # module rejects with "multiple repeat".
+        return r1
+    if r1[0] > 1:
+        # Sequences and alternations must be grouped so "*" applies to all of it.
+        r1 = parenthesize(r1)
+    return (1, "{}*".format(r1[1]))
+def sequence(r1, r2):
+    """Return the fragment matching r1 followed by r2.
+
+    Operands looser than a sequence (precedence above 2, i.e. alternations)
+    are parenthesized first. The result has precedence 2.
+    """
+    left = parenthesize(r1) if r1[0] > 2 else r1
+    right = parenthesize(r2) if r2[0] > 2 else r2
+    return (2, "{}{}".format(left[1], right[1]))
+def or_(r1, r2):
+    """Return the fragment matching either r1 or r2 (precedence 3, the loosest level)."""
+    left_text = r1[1]
+    right_text = r2[1]
+    return (3, "{}|{}".format(left_text, right_text))
+# Re-key every transition in the working copy: plain input symbols become
+# (precedence, text) regex fragments so they can be combined later.
+for state in DFA:
+    dfa[state] = {atom(symbol): target for symbol, target in dfa[state].items()}
+
+def combine_duplicate_transitions(t, s1):
+    """Merge parallel edges leaving s1 into a single alternation edge.
+
+    If S1 -A-> S2 and S1 -B-> S2 both exist, they are replaced in place by
+    S1 -A|B-> S2. Targets reached by only one edge are left untouched.
+    """
+    edges = t[s1]
+    # groupby only groups adjacent items, so order the (target, label) pairs first.
+    by_target = sorted((target, key) for key, target in edges.items())
+    for target, members in itertools.groupby(by_target, key=lambda pair: pair[0]):
+        keys = [key for _, key in members]
+        if len(keys) < 2:
+            continue
+        for key in keys:
+            assert edges[key] == target
+            del edges[key]
+        edges[functools.reduce(or_, keys)] = target
+def find_self_transition(t, s1):
+    """Return the label of the first edge from s1 back to s1, or None if there is none."""
+    loops = (key for key, target in t[s1].items() if target == s1)
+    return next(loops, None)
+def remove_self_transitions(t, s1):
+    """Eliminate every edge from s1 back to itself.
+
+    For each self-loop S1 -A-> S1, the loop is dropped and every remaining
+    edge S1 -B-> S2 is relabelled S1 -A*B-> S2. This repeats until s1 has no
+    self-loops left.
+    """
+    key = find_self_transition(t, s1)
+    while key is not None:
+        assert t[s1][key] == s1
+        del t[s1][key]
+        ks = kleene_star(key)
+        # Rebuild the edge dict: every surviving label gets the starred loop as a prefix.
+        t[s1] = {sequence(ks, key2): target for key2, target in t[s1].items()}
+        key = find_self_transition(t, s1)
+def remove_transition(t, s1, key):
+    """Bypass the edge S1 -key-> S2 so that it can be removed.
+
+    The edge is removed from t[s1]; then for every S2 -B-> S3 an edge
+    S1 -(key)(B)-> S3 is added, and each such rewrite is printed.
+    """
+    s2 = t[s1].pop(key)
+    assert s1 != s2
+    for key2, s3 in t[s2].items():
+        combined = sequence(key, key2)
+        print(s1, "-", key, "->", s2, "-", key2, "->", s3, "=", s1, "-", combined, "->", s3)
+        t[s1][combined] = s3
+def remove_state(t, s2):
+    """Delete the intermediate state s2 from the transition table t.
+
+    S2 is not the start or end state, and you want to remove it.
+    Every incoming edge S1 -A-> S2 is bypassed with remove_transition, after
+    which s2 is unreachable and its entry is deleted.
+
+    s2 must be present in t and must have no self-transitions.
+    """
+    # Check membership first: find_self_transition indexes t[s2], so with the
+    # checks in the other order a missing state raises KeyError before the
+    # membership assertion can report it.
+    assert s2 in t
+    assert find_self_transition(t, s2) is None
+    # Collect the incoming edges up front; remove_transition mutates t[s1].
+    incoming = [(s1, key) for s1 in t for key in t[s1] if t[s1][key] == s2]
+    for (s1, key) in incoming:
+        remove_transition(t, s1, key)
+    del t[s2]
+
+def simplify(t):
+    """Normalize every state of t: merge parallel edges, remove self-loops, merge again.
+
+    The table is printed (via print_dfa) after each step.
+    """
+    passes = (
+        (combine_duplicate_transitions, "Combined duplicates {}"),
+        (remove_self_transitions, "Removed self-transitions {}"),
+        # Removing self-loops relabels edges, so merge parallel edges once more.
+        (combine_duplicate_transitions, "Combined duplicates {}"),
+    )
+    for s in t:
+        for step, label in passes:
+            step(t, s)
+            print_dfa(label.format(s), t)
+
+def simulate_state(dfa, s):
+    """Run the plain DFA on the symbols of s, beginning at "start", and return the final state."""
+    return functools.reduce(lambda state, symbol: dfa[state][symbol], s, "start")
+def simulate_t(t, inp, debug=False):
+    """Return True if the regex-labelled transition table t accepts inp.
+
+    t maps each state to {(precedence, regex_text): target_state}. A run
+    begins in "start" and accepts when it reaches "end" with all of inp
+    consumed.
+
+    Every edge is tried against every prefix of the remaining input. Following
+    only the first matching edge with its greedy match can reject strings the
+    table accepts: with start -0*-> mid -0-> end, the greedy match of 0* on
+    "00" leaves nothing for the final 0.
+
+    With debug=True, each explored (state, remaining input) configuration is printed.
+    """
+    pending = [("start", 0)]
+    # Configurations already explored; this also guards against cycles of
+    # edges that match the empty string.
+    visited = set()
+    while pending:
+        state, pos = pending.pop()
+        if (state, pos) in visited:
+            continue
+        visited.add((state, pos))
+        if debug:
+            print(inp, inp[pos:], state, t[state])
+        if state == "end" and pos == len(inp):
+            return True
+        for key, target in t[state].items():
+            for stop in range(pos, len(inp) + 1):
+                if re.fullmatch(key[1], inp[pos:stop]):
+                    pending.append((target, stop))
+    return False
+def verify_against(t, f, tests=None):
+    """Check that the table t agrees with the reference predicate f on every test string.
+
+    t     -- regex-labelled transition table, as accepted by simulate_t
+    f     -- callable taking a test string and returning the expected result
+    tests -- iterable of input strings; defaults to the binary numerals of 0..99
+
+    Returns True when all tests agree. On the first disagreement the mismatch
+    is printed, the failing input is re-run with debug output, and False is
+    returned.
+    """
+    if tests is None:
+        # Built per call rather than as a default value, so no list is shared between calls.
+        tests = ["{:b}".format(x) for x in range(100)]
+    for x in tests:
+        actual = simulate_t(t, x)
+        expected = f(x)
+        if actual != expected:
+            print("INCORRECT ON {}: {} {}".format(x, actual, expected))
+            simulate_t(t, x, debug=True)
+            return False
+    return True
+
+def repr_dfa(t):
+    """Render t as a tab-separated table with one line per edge, sorted by state and then by label."""
+    rows = ["State\tRegex\tTo\n"]
+    for state in sorted(t):
+        edges = t[state]
+        rows.extend(
+            "{}\t{}\t{}\n".format(state, key, edges[key]) for key in sorted(edges)
+        )
+    return "".join(rows)
+# Text of the most recently printed table; lets print_dfa skip steps that changed nothing.
+last = ""
+def print_dfa(reason, t):
+    """Print reason and the current table, unless the table is unchanged since the last print.
+
+    After printing, the table is re-verified against "binary value divisible
+    by 7", and "WRONG" is printed if that check fails.
+    """
+    global last
+    cur = repr_dfa(t)
+    if cur != last:
+        print(reason)
+        print(cur)
+        last = cur
+        if not verify_against(t, lambda x: int(x,2)%7==0):
+            print("WRONG")
+
+def dfa2regex(t):
+    """Collapse the transition table t into a single regex by state elimination.
+
+    t must contain the states "start" and "end". Every other state is removed
+    in turn (simplifying before each removal) until only one edge
+    start -> end remains; that edge's regex text is returned.
+
+    t is modified in place, and progress is printed through print_dfa.
+    """
+    print_dfa("Initial configuration", t)
+    # Take the intermediate states from the table itself instead of assuming
+    # they are named "0".."6". The list is a snapshot because remove_state
+    # deletes entries from t while this loop runs.
+    intermediate = [s for s in t if s not in ("start", "end")]
+    for x in intermediate:
+        simplify(t)
+        remove_state(t, x)
+        print_dfa("Removed state {}".format(x), t)
+    simplify(t)
+    assert set(t.keys()) == {"start", "end"}
+    assert t["end"] == {}
+    assert len(t["start"]) == 1
+    regex = list(t["start"].keys())[0]
+    assert t["start"][regex] == "end"
+    return regex[1]
+
+# Build the regex, anchor it, and compare it with the original DFA on 0..99.
+regex = dfa2regex(dfa)
+regex = "^({})$".format(regex)
+r = re.compile(regex)
+# One header label per printed column.
+print("number\tmod 7\tbinary\tDFA state\tregex")
+correct = 0
+for x in range(100):
+    bin_ = bin(x)[2:]
+    matched = bool(r.fullmatch(bin_))
+    # Five placeholders for five values, so the regex verdict is actually shown.
+    print("{}\t{}\t{}\t{}\t{}".format(x, x%7, bin_, simulate_state(DFA, bin_), matched))
+    # Count each number once; the pattern is anchored, so a separate match() check is redundant.
+    if matched == (x%7 == 0):
+        correct += 1
+print(correct)
+
+print(regex)