-
Notifications
You must be signed in to change notification settings - Fork 0
/
mimir_string_matching.py
155 lines (128 loc) · 4.86 KB
/
mimir_string_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python3.6
import os
import csv
import re
import sys
import getopt
import numpy as np
# Command-line synopsis shown for --help.
USAGE = (
    f"Usage: python {sys.argv[0]} [--help] | "
    "[-a <student_answers_file>] "
    "[-e <expected_answers_csv>] "
    "[-p <all|none|marks|solutions>]"
)

# Minimum Levenshtein ratio (exclusive) for an answer to count as a match.
MIN_RATIO = 0.85
# code for levenshtein_ratio taken from
# https://www.datacamp.com/community/tutorials/fuzzy-string-python
def levenshtein_ratio(s, t, ratio_calc=True):
    """Compute the Levenshtein similarity ratio (or distance) of two strings.

    For all i and j, distance[i, j] holds the edit distance between the
    first i characters of s and the first j characters of t.

    Args:
        s: First string.
        t: Second string.
        ratio_calc: When True (the default) a substitution costs 2 and the
            similarity ratio (len(s) + len(t) - distance) / (len(s) + len(t))
            is returned. When False a substitution costs 1 and a message
            containing the plain edit distance is returned.

    Returns:
        The similarity ratio when ratio_calc is True (1.0 when both strings
        are empty), otherwise the string "The strings are N edits away".
    """
    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype=int)

    # Border cells: reaching/leaving the empty prefix costs one edit per
    # character. Filled directly so they are correct even when one of the
    # strings is empty.
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    substitution_cost = 2 if ratio_calc else 1
    for col in range(1, cols):
        for row in range(1, rows):
            cost = 0 if s[row - 1] == t[col - 1] else substitution_cost
            distance[row][col] = min(distance[row - 1][col] + 1,         # deletion
                                     distance[row][col - 1] + 1,         # insertion
                                     distance[row - 1][col - 1] + cost)  # substitution

    # Read the final cell by index instead of relying on leaked loop
    # variables, which are unbound when either string is empty.
    edits = distance[rows - 1][cols - 1]
    if not ratio_calc:
        return "The strings are {} edits away".format(edits)

    total_length = len(s) + len(t)
    if total_length == 0:
        # Two empty strings are identical; avoid dividing by zero.
        return 1.0
    return (total_length - edits) / total_length
def remove_comments(s):
    """Return s with everything from the first "//" onwards dropped.

    The remaining text is stripped of leading and trailing whitespace.
    """
    before_comment, _, _ = s.partition("//")
    return before_comment.strip()
def no_blanks(s):
    """Filter predicate: True for anything other than the empty string."""
    is_empty = s == ""
    return not is_empty
def matches(expected, actual):
    """Tell whether actual is close enough to expected, ignoring case.

    Both strings are lower-cased and compared with levenshtein_ratio; the
    result is truthy when the ratio is strictly greater than MIN_RATIO.
    """
    return levenshtein_ratio(expected.lower(), actual.lower()) > MIN_RATIO
def extract_answers(number, expecteds, all_answers):
    """Find the student's answer for one question and grade it.

    The first line in all_answers that starts with the question label
    (optionally followed by whitespace) is taken as the student's answer;
    later lines with the same label are ignored.

    Args:
        number: Question label that prefixes the answer line.
        expecteds: Acceptable answers for this question (may be empty).
        all_answers: Student answer lines.

    Returns:
        A tuple (result, expected, actual):
        - answer line found and matched: (truthy, matching expected, actual)
        - answer line found, no match:   (False, first expected, actual)
        - no answer line found:          (False, first expected, None)
        "first expected" is None when expecteds is empty.
    """
    # Escape the label so characters such as '.', '(' or ')' are matched
    # literally instead of being interpreted as regex syntax (an unescaped
    # "(a)" would add a capture group and hijack group(1)).
    pattern = re.compile(fr'{re.escape(str(number))}\s*(.+)')

    # Reported as the expected answer on failure; guards against an empty
    # list of expected answers.
    fallback = expecteds[0] if expecteds else None

    for answer in all_answers:
        found = pattern.match(answer)
        if not found:
            continue
        actual = found.group(1)
        for expected in expecteds:
            result = matches(expected, actual)
            if result:
                return (result, expected, actual)
        # Only the first line carrying this label is considered.
        return (False, fallback, actual)
    return (False, fallback, None)
def parse(args):
    """Parse the command-line arguments.

    Args:
        args: Argument list without the program name (e.g. sys.argv[1:]).

    Returns:
        A tuple (student_answers, expected_answers, output_opt) holding the
        student answers file name, the expected answers CSV file name and
        the output mode. Defaults are 'answer.txt', 'expected.csv' and 'all'.

    Exits:
        With status 0 after printing USAGE for -h/--help, and with status 2
        after printing the error and USAGE to stderr for invalid options.
    """
    # default file names
    student_answers = 'answer.txt'
    expected_answers = 'expected.csv'
    output_opt = 'all'

    try:
        options, _ = getopt.getopt(
            args, 'ha:e:p:', ['help', 'answers=', 'expected=', 'print='])
    except getopt.GetoptError as err:
        # Unknown option or missing argument: show usage, not a traceback.
        print(err, file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(2)

    # getopt only returns options it recognises, so no fallback branch is
    # needed here.
    for opt, arg in options:
        if opt in ('-h', '--help'):
            print(USAGE)
            sys.exit()
        elif opt in ('-a', '--answers'):
            student_answers = arg
        elif opt in ('-e', '--expected'):
            expected_answers = arg
        elif opt in ('-p', '--print'):
            output_opt = arg
    return (student_answers, expected_answers, output_opt)
def main(args):
    """Grade a student's answers file against the expected answers CSV.

    Each CSV row is a question label followed by its acceptable answers.
    A per-question report (its detail controlled by the output option) and
    a summary line are written to the file 'DEBUG'; the integer percentage
    score is written to the file 'OUTPUT'.

    Args:
        args: Command-line arguments without the program name.

    Raises:
        FileNotFoundError: If the expected answers CSV or the student
            answers file does not exist.
    """
    student_answers, expected_answers, output_opt = parse(args)

    if not os.path.exists(expected_answers):
        raise FileNotFoundError(f"Could not find file '{expected_answers}'")
    # newline='' is what the csv module asks for when reading files.
    with open(expected_answers, newline='') as f:
        expected = {
            row[0]: list(filter(no_blanks, map(str.strip, row[1:])))
            for row in csv.reader(f)
            if row  # csv.reader yields [] for blank lines; skip them
        }

    if not os.path.exists(student_answers):
        raise FileNotFoundError(f"Could not find file '{student_answers}'")
    with open(student_answers) as f:
        answers = list(filter(no_blanks, map(remove_comments, f)))

    results = {k: extract_answers(k, v, answers) for k, v in expected.items()}

    show_marks = output_opt in ('all', 'marks', 'solutions')
    show_expected = output_opt in ('all', 'solutions')

    count_right = 0
    count_total = 0
    with open('DEBUG', 'w') as f:
        for k, v in results.items():
            if v[0]:
                count_right += 1
            count_total += 1
            if show_marks:
                print(f"{k} {'Correct.' if v[0] else 'Incorrect.'} ", file=f, end='')
                if show_expected:
                    print(f"Expected '{v[1]}'. ", file=f, end='')
                print(f"Was '{v[2]}'.", file=f, end='')
                print('', file=f)
        # An empty expected-answers file scores 0 rather than dividing by zero.
        percent = count_right * 100 // count_total if count_total else 0
        if show_marks:
            print(f"{count_right} out of {count_total} correct. Your score is {percent}%.", file=f)

    with open('OUTPUT', 'w') as f:
        print(percent, file=f)
# Script entry point: grade using the command-line arguments, leaving out
# the program name.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)