-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis.py
205 lines (146 loc) · 7.54 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import re
from ciphers import analysis_ciphers
from dictionary import dictionary_all, AnagramLookupTable
from math import log2
import time
from collections.abc import Callable
# Frequency analysis
def analyze_frequencies(string: str) -> dict[str, int]:
    """Count how often each symbol occurs in *string*, ignoring case.

    The input is lowercased first. What counts as a symbol depends on the
    input:

    * If the string contains spaces and every space-separated group is at
      most four characters long, each group is one symbol.
    * If it contains spaces but some group is longer than four characters,
      the input is treated as running text and every alphanumeric
      character is one symbol.
    * Without spaces, every character is one symbol.

    Returns a dict mapping each symbol to its number of occurrences, keyed
    in order of first appearance.
    """
    string = string.lower()
    if " " in string:
        # Leading, trailing or doubled spaces produce empty groups; they are
        # not symbols and are dropped.
        symbols = [group for group in string.split(" ") if group]
        if any(len(group) > 4 for group in symbols):
            # Only works for alphanumeric symbols
            symbols = [c for c in string if c.isalnum()]
    else:
        symbols = list(string)

    # Tally whole symbols. str.count() would count substring matches, so the
    # group "1" would also be counted inside the group "11".
    frequencies: dict[str, int] = {}
    for symbol in symbols:
        frequencies[symbol] = frequencies.get(symbol, 0) + 1
    return frequencies
def calculate_entropy(probs: list[float]) -> float:
    """Return the Shannon entropy, in bits, of the probabilities in *probs*.

    Computes the sum of ``-p * log2(p)`` over all entries. Entries equal to
    zero are skipped (the usual ``0 * log2(0) = 0`` convention) instead of
    raising a math domain error. Negative entries still raise ValueError.
    An empty list yields 0.
    """
    return sum(-1.0 * p * log2(p) for p in probs if p != 0)
# Transforms
def string_reverse(string: str) -> tuple[str, str]:
    """Return *string* with its characters in reverse order, plus the label "reverse"."""
    backwards = string[::-1]
    return backwards, "reverse"
def string_reverse_groups(string: str) -> tuple[str, str]:
    """Reverse the characters inside every space-separated group.

    The order of the groups is kept. Returns the transformed string and the
    label "reverse groups".
    """
    flipped_groups = [group[::-1] for group in string.split(" ")]
    return " ".join(flipped_groups), "reverse groups"
def string_reverse_group_order(string: str) -> tuple[str, str]:
    """Reverse the order of the space-separated groups.

    The characters inside each group are kept as they are. Returns the
    transformed string and the label "reverse group order".
    """
    groups = string.split(" ")
    groups.reverse()
    return " ".join(groups), "reverse group order"
def string_bigram_substitue(string: str) -> tuple[str, str] | None:
if not " " in string and len(string) % 2 != 0:
return None
pairs = string.split(" ") if " " in string else [string[i:i+2] for i in range(0, len(string), 2)]
result = ""
substitutes_by_pair = {}
next_char = 97
for p in pairs:
if p not in substitutes_by_pair:
substitutes_by_pair[p] = chr(next_char)
next_char += 1
result += substitutes_by_pair[p]
return result, "bigram group"
def apply_all_ciphers(string: str) -> list[tuple[str, str]]:
    """Run every cipher in ``analysis_ciphers`` over *string* in both directions.

    For each cipher, ``encode`` is called with the UTF-8 bytes of *string*
    and ``decode`` with *string* itself. Either call may return a single
    value, a list of values, or None (nothing is recorded for None).
    Encoded values are recorded as returned; decoded values are decoded
    from UTF-8 first.

    Returns a list of ``(text, cipher class name)`` pairs, encodings before
    decodings for each cipher.
    """
    collected = []
    for cipher in analysis_ciphers:
        label = type(cipher).__name__

        encoded = cipher.encode(string.encode("utf-8"))
        if isinstance(encoded, list):
            collected.extend((item, label) for item in encoded)
        elif encoded is not None:
            collected.append((encoded, label))

        decoded = cipher.decode(string)
        if isinstance(decoded, list):
            collected.extend((item.decode("utf-8"), label) for item in decoded)
        elif decoded is not None:
            collected.append((decoded.decode("utf-8"), label))
    return collected
# Lookup table built once at import time from dictionary_all; queried by
# apply_anagram_search.
analysis_anagram_lookup_table = AnagramLookupTable(dictionary_all)
def apply_anagram_search(string: str) -> list[tuple[str, str]]:
    """Look up *string* in the module-level anagram table.

    Strings containing "?" are not looked up. Returns one
    ``(anagram, "anagram")`` pair per match, or an empty list when the
    lookup is skipped or returns None.
    """
    if "?" in string:
        return []

    matches = analysis_anagram_lookup_table.lookup(string)
    if matches is None:
        return []
    return [(match, "anagram") for match in matches]
def remove_spaces(string: str) -> tuple[str, str]:
    """Return *string* without any space characters, plus the label "remove spaces"."""
    compact = "".join(string.split(" "))
    return compact, "remove spaces"
# Transform functions applied, in this order, by bruteforce_string. Each one
# returns either a single (text, label) pair or a list of such pairs.
transforms = [
    string_reverse,
    string_reverse_group_order,
    string_reverse_groups,
    remove_spaces,
    apply_all_ciphers,
    apply_anagram_search,
]
# Validators
def is_isbn(string: str, dictionary: list[str]) -> bool:
    """Return True if the digits in *string* form a valid ISBN-10 or ISBN-13.

    All characters other than the ASCII digits and "x"/"X" are ignored, so
    hyphens and spaces may be present.

    * 10 symbols: ISBN-10 check. The symbols are weighted 10 down to 1 and
      the sum must be divisible by 11. "X" stands for 10 and is only
      accepted as the final (check) symbol.
    * 13 symbols: ISBN-13 check. The symbols are weighted 1, 3, 1, 3, ...
      and the sum must be divisible by 10. "X" is not accepted.

    Any other number of symbols is rejected. *dictionary* is unused; it is
    part of the common validator signature.
    """
    # Restrict to ASCII digits: str.isdigit() also accepts characters such
    # as superscripts that int() cannot convert.
    symbols = [c for c in string.lower() if c in "0123456789x"]

    if len(symbols) == 10:
        if "x" in symbols[:9]:
            return False
        values = [10 if c == "x" else int(c) for c in symbols]
        weighted = sum(weight * value for weight, value in zip(range(10, 0, -1), values))
        return weighted % 11 == 0

    if len(symbols) == 13:
        if "x" in symbols:
            return False
        weighted = sum((3 if index % 2 else 1) * int(c) for index, c in enumerate(symbols))
        return weighted % 10 == 0

    return False
def is_part_word(string: str, dictionary: list[str]) -> bool:
    """Return True if any part of *string* is a word in *dictionary*.

    Candidate words are collected as follows, each one lowercased and
    stripped of everything except the letters a-z:

    * if the string contains spaces, its space-separated parts;
    * if it contains any of ``- : . , ; _``, the parts obtained after
      replacing those characters by spaces;
    * if neither applies, the whole string.

    Only candidates of at least three letters are looked up.
    """
    separators = r"[\-:\.,;_]"

    def normalize(word: str) -> str:
        # Same clean-up for every candidate, so "Hello" and "hello!" are
        # treated alike regardless of which branch produced them.
        return re.sub(r"[^a-z]", "", word.lower())

    candidates: list[str] = []
    if " " in string:
        candidates += [normalize(part) for part in string.split(" ")]
    if re.search(separators, string):
        candidates += [normalize(part) for part in re.sub(separators, " ", string).split(" ")]
    if not candidates:
        candidates.append(normalize(string))
    return any(word in dictionary for word in candidates if len(word) >= 3)
def is_munich_phone_number(string: str, dictionary: list[str]) -> bool:
    """Return True if the digits of *string*, read in order, begin with 4989.

    All non-digit characters are dropped before the comparison.
    Presumably 49 is the country code and 89 the area code. *dictionary*
    is unused; it is part of the common validator signature.
    """
    digits_only = re.sub(r"\D", "", string)
    return digits_only[:4] == "4989"
def could_be_coordinate(string: str, dictionary: list[str]) -> bool:
    """Return True if *string* looks like it could contain a coordinate.

    That requires an uppercase "N" or "S", an uppercase "E" or "W", and at
    least one digit anywhere in the string. *dictionary* is unused; it is
    part of the common validator signature.
    """
    has_north_south = "N" in string or "S" in string
    has_east_west = "E" in string or "W" in string
    if not (has_north_south and has_east_west):
        return False
    return any(char.isdigit() for char in string)
# Validators in priority order: bruteforce_string records the first one that
# accepts a candidate, and bruteforce_string_filter_sort sorts results by a
# validator's position in this list.
validators = [
    is_part_word,
    is_isbn,
    is_munich_phone_number,
    could_be_coordinate,
]
# Bruteforce analysis
class BruteforceResult:
string: str
path: list[str]
depth: int
validator: Callable[[str, list[str]], bool] | None
def __init__(self, string: str, path: list[str], depth: int):
self.string = string
self.path = path
self.depth = depth
self.validator = None
def __str__(self) -> str:
return f"{self.string} --- {"->".join(self.path)} --- {self.validator.__name__ if self.validator is not None else "none"}"
def bruteforce_string(
    string: str,
    timeout_stamp: float,
    path: list[str] | None = None,
    total_iterations: int = 3,
    depth: int = 0,
    seen_already: set[str] | None = None,
) -> set[BruteforceResult]:
    """Recursively apply every transform to *string* and keep validated results.

    Args:
        string: Text to transform; it is lowercased before use.
        timeout_stamp: ``time.time()`` value after which no new recursion
            level is started. It is only checked on entry, so a level that
            has already started runs to completion.
        path: Labels of the transforms applied so far (empty at the top).
        total_iterations: Maximum number of chained transforms.
        depth: Current recursion depth; callers normally leave this at 0.
        seen_already: Candidate strings that must not be produced again.

    Returns:
        The set of results, from this level and all deeper levels, that
        were accepted by at least one validator. Each result's
        ``validator`` is the first entry of ``validators`` that accepted it.
    """
    if depth == total_iterations or time.time() > timeout_stamp:
        return set()

    # None sentinels instead of mutable default arguments.
    path = [] if path is None else path
    seen_already = set() if seen_already is None else seen_already

    string = string.lower()
    results: set[BruteforceResult] = set()
    for transform in transforms:
        # generate candidates
        new_candidates = transform(string)
        if isinstance(new_candidates, list):
            new_results = [BruteforceResult(c[0], path + [c[1]], depth + 1) for c in set(new_candidates)]
        else:
            new_results = [BruteforceResult(new_candidates[0], path + [new_candidates[1]], depth + 1)]

        # filter already seen, long, unknown, non alphanumerical
        new_results = [
            r for r in new_results
            if r.string not in seen_already
            and len(r.string) < 200
            and r.string.count("?") < 5
            and any(c.isalnum() for c in r.string)
            and "No spaces in string" not in r.string
        ]

        # deepen
        # A new set is bound locally, so strings discovered by deeper calls
        # are not visible to sibling transforms at this level.
        seen_already = seen_already | {r.string for r in new_results}
        deeper_results = [
            cr
            for r in new_results
            for cr in bruteforce_string(r.string, timeout_stamp, r.path, total_iterations, depth + 1, seen_already)
        ]

        # add results
        results.update(new_results)
        results.update(deeper_results)

    # validate: the first validator that accepts a candidate wins
    for r in results:
        for v in validators:
            if v(r.string, dictionary_all):
                r.validator = v
                break
    return {r for r in results if r.validator is not None}
def bruteforce_string_filter_sort(string: str, timeout_stamp: float, total_iterations: int = 4) -> list[BruteforceResult]:
    """Run the brute-force search and return de-duplicated, ordered results.

    For every distinct result string only one result is kept, taken from
    the smallest depth at which that string was found. The kept results are
    then ordered by the position of their validator in ``validators``;
    results sharing a validator stay in order of increasing depth.

    Args:
        string: Text to analyse.
        timeout_stamp: ``time.time()`` value after which the search stops
            starting new recursion levels.
        total_iterations: Maximum number of chained transforms.
    """
    results = bruteforce_string(string, timeout_stamp=timeout_stamp, total_iterations=total_iterations)

    unique_results = []
    seen_strings = set()  # O(1) membership test instead of rebuilding a list per result
    for r in sorted(results, key=lambda r: r.depth):
        if r.string not in seen_strings:
            seen_strings.add(r.string)
            unique_results.append(r)

    # Stable sort: the depth order established above is kept within each validator.
    unique_results.sort(key=lambda r: validators.index(r.validator))
    return unique_results
# Test
if __name__ == "__main__":
    # Manual smoke test: analyse each sample and print what was found.
    samples = [
        # "3 22 4 9 99 55 9999 8 55 6 99 4",
        "..- .--- -.-. -. --.- -.- .-.. .-- --.. ....",
        "24 22 23 5 23 18 23 18 21 8 18",
    ]
    for sample in samples:
        # Every sample gets its own 30 second budget and its results are
        # printed before the next sample runs, so none are discarded.
        print(f"# {sample}")
        for r in bruteforce_string_filter_sort(sample, timeout_stamp=time.time() + 30.0, total_iterations=4):
            print(r)