-
Notifications
You must be signed in to change notification settings - Fork 1
/
parser.py
executable file
·254 lines (225 loc) · 8.5 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python2
from sys import argv
import re
#filename = argv[1]
options = ['Ae', 'An', 'ACM', 'AM', 'APh', 'Art', 'Ay', 'BMB', 'BE', 'Bi',
'BEM', 'Ch', 'CHE', 'CDS', 'CNS', 'CS', 'CE', 'Ec', 'ESL', 'EE', 'E', 'EN',
'ESE', 'F', 'FS', 'Ge', 'H', 'HPS', 'Hum', 'L', 'Law', 'Ma', 'MS', 'ME',
'Mu', 'Pl', 'PE', 'Ph', 'PS', 'PA', 'Psy', 'SS', 'SA', 'Wr']
types = ['course_name', 'section number', 'units', 'prof name', 'days/time',
'location', 'grade scheme', 'annotation', 'unsure']
state = 'course'
cur_option = ""
cur_number = 0
cur_name = ""
cur_units = []
cur_class = ""
cur_section = ""
#with open(filename, "r") as f:
#lines = f.readlines()
#for line in lines:
#line = " ".join(line.split())
## Are we looking at an option?
#option_line = False
#for opt in options:
#m = re.match(opt, line)
#if (m):
## We have a potential option at the beginning of our line, so we
## will insert a space between any occurrence of letter-number
#match = re.search('[A-Za-z][0-9]', line)
#if (match):
## We need to separate letter-number now
#first_half = line[0: match.start() + 1]
#second_half = line[match.end() - 1:]
#newline = first_half + ' ' + second_half
## Now that we have separated letter and number, we can tokenize
## the string into 2 list items, hopefully the option and the
## course number
#tok_line = newline.split()
#if len(tok_line) != 2:
## Must contain two tokens (option, course_number) to be
## option line
#break
#if not (re.match('[0-9]+[A-Za-z]*', tok_line[1])):
## Must have a course number matching the above regex
#break
#option_line = True
#if option_line:
#break
#if state == "course":
## Course state will handle option, number, name, and units.
##tok_line = line.split()
#if option_line:
#cur_option = tok_line[0]
#cur_number = tok_line[1]
#print "Option: %s, Number: %s" % (cur_option, cur_number)
#elif (len(tok_line) > 1):
## Assuming all courses have a name longer than one word.
#cur_name = " ".join(tok_line)
#print cur_name
#else:
## Token line has 1 "word" only, this means units
#cur_units = tok_line[0].split("-")
#print cur_units
#option_line = False
#elif state == "section":
#""" section will handle professor first and last names, subtitle,
#annotations, fixed time, times, days, locations, and grades. """
#pass
#types = ['course_name', 'course title', 'section number', 'units', 'prof
#name', 'days/time', 'location', 'grade scheme', 'annotation', 'unsure']
def step3(current_type, current_line):
type_name = current_type.NAME
if type_name == 'unsure':
return # and skip to step 5
'''
Notes for step 3 and beyond:
Guesses:
0. Course name: the two tokens it is divided into should be read as the
option and the course number.
1. Course title: however many tokens it takes to form the title of the
course.
2. Section number: the one token it is read as is the section number.
3. Units: we parse it into three tokens and construct a list out of
them.
4. Professor name: we remove the comma from the first name and read the
professor's first name and last name. We will put professor names into a
list in step 3.
5. Days/time: Divide into two tokens, associate the days with the hours
on those days. A dictionary perhaps?
6. Location: Initial parsing should suffice for this.
7. Grade scheme: PASS-FAIL or GRADES.
8. Annotation: something like "lottery only..."
9. Unsure. Prompt user for manual input.
'''
def e_0(tokens, current_line):
''' Process the initial tokens, guessing that they describes a course
name.
'''
return tokens, current_line
def e_1(tokens, current_line):
''' Process the initial tokens, guessing that they describe a course
title.
'''
return tokens, current_line
def e_2(tokens, current_line):
''' Process the initial parsing with the guess that we have a section
number. '''
return tokens, current_line
def e_3(tokens, current_line):
''' Process the result of the initial parsing with the guess that we
have a units line.
'''
return (current_line.split('-'), current_line)
def e_4(tokens, current_line):
''' Process the initial tokens with the guess that we are processing
professor name(s).
'''
profs = []
cur_prof = []
name_end = False
for tok in tokens:
if name_end:
cur_prof.append(tok)
profs.append(cur_prof)
cur_prof = []
name_end = False
elif tok[-1] == ',':
name_end = True
cur_prof.append(tok[:-1])
else:
cur_prof.append(tok)
return (profs, current_line)
def e_5(tokens, current_line):
''' Process the initial tokens with the guess that we are processing
day/times.
Sample input: ['MWF', '11:30', '-', '13:00']
Sample output: [[1, 3, 5], 11.5, 13]
'''
day_to_num = {'M': 1, 'T': 2, 'W': 3, 'R': 4, 'F': 5}
days = tokens[0]
day_list = []
for d in days:
day_list.append(day_to_num[d])
start_time, end_time = filter(lambda s: len(s) > 1, tokens[1:])
# Gives us filtered = ['11:30', '13:00']
def processTime(t):
[left, right] = t.split(':')
if right == '00':
return left
return left + str(int(right) / 60.0)[1:]
output = [day_list]
output.append(processTime(start_time))
output.append(processTime(end_time))
return output
def e_6(tokens, current_line):
''' Further parse the initial parsing, guessing that we have a location.
'''
return tokens, current_line
def e_7(tokens, current_line):
''' Further parse the initial parsing, guessing that we have a grade
scheme.
'''
return tokens, current_line
def e_8(tokens, current_line):
''' Improve on the initial parsing, guessing that we have an annotation.
'''
return tokens, current_line
def f_0(tokens, string):
# Return true if tokens match course name, false otherwise
if len(tokens) == 2:
opt = tokens[0]
number = tokens[1]
if opt + number == string or opt + ' ' + number == string:
# Our tokens, when combined (modulo space) should give us back
# our original string.
if opt in options:
# Final check for opt being a valid option
return True
return False
def f_1(tokens, string):
# Return true if tokens match section number
if len(tokens) == 1 and re.match('^[0-9]{1,2}$', tokens[0]):
if tokens[0] == string:
return True
return False
def f_2(tokens, string):
# Return true if tokens match units
if len(tokens) == 3:
for tok in tokens:
if not re.match('1?[0-9]', tok) or not tok in string:
return False
return True
def flatten_level_one_list(xs):
return [a for b in xs for a in b]
def f_3(tokens, string):
''' Return true if tokens match professor name(s). The tokens we receive
from step 3 will be a list of professors. For example, [['Vanier',
'M'], ['Pinkston', 'D']].
'''
# Each comma in a professor string should be a separate name
if len(tokens) == string.count(','):
s = string.split(' ')
# Remove the "/" character from the original string
simple_parse = filter(lambda w: re.match('[A-Za-z]', w), s)
delete_commas = map(lambda w: w.replace(",", ""), simple_parse)
return delete_commas == flatten_level_one_list(tokens)
return False
def f_4(tokens, string):
# Check if tokens match days/time
if len(tokens) == 4:
if match('[MWTRF]+', tokens[0]):
if match(r'([0-9]|0[0-9]|1[0-9]|2[0-3])\.?[0-9]?', tokens[1]) and match(r'([0-9]|0[0-9]|1[0-9]|2[0-3])\.?[0-9]?', tokens[3]):
return True
return False
name_tokens = ['Von', 'Lewen,', 'M', 'Pinkston', 'Lloyd,', 'D']
name_string = 'Von Lewen, M / Pinkston Lloyd, D'
daytime_tokens = ['MWF', '11:30', '-', '13:00']
daytime_string = 'MWF 11:30 - 13:00'
def main():
print name_string
print e_4(name_tokens, name_string)
print daytime_string
print e_5(daytime_tokens, daytime_string)
if __name__ == '__main__':
main()