import logging
import string

from RelationalOp import *  # expected to provide Constant, Expression, Term and Predicate

db_logger = logging.getLogger('SimpleDB')
# SimpleDB supports five types of tokens:
# single-character delimiters such as the comma and the period
#   (table.col would have to be read as three tokens, table, period and column,
#    but SimpleDB does not support that syntax)
# integer constants such as 123
# string constants such as 'joe'
# keywords such as select, from and where
# identifiers such as Student, X and glop_34a
class Tokenizer:
EOF = -1
DELIMITER = 0
IntConstant = 1
StringConstant = 2
Keyword = 3
Id = 4
KEYWORD_LIST = ("select", "from", "where", "and", "insert", "into", "values", "delete", "update",
"set", "create", "table", "int", "varchar", "view", "as", "index", "on")
def __init__(self, input):
self.input = input
self.pos = 0
self._tokenIndex = 0
    # keywords are matched case-insensitively (lowercased before lookup);
    # identifiers keep their original case
    def nextToken(self):
        # skip leading whitespace first; the bounds check prevents an
        # IndexError when the input ends in whitespace
        while self.pos < len(self.input) and self.input[self.pos] in string.whitespace:
            self.pos += 1
        if self.pos >= len(self.input):
            return Tokenizer.EOF, Tokenizer.EOF
start = self.pos
token_type = None
token_value = None
if self.pos < len(self.input) and self.input[self.pos] in string.ascii_letters:
while self.pos < len(self.input) and self.input[self.pos] in set(string.ascii_letters + string.digits + '_'):
self.pos += 1
if self.input[start:self.pos].lower() in Tokenizer.KEYWORD_LIST:
token_type, token_value = Tokenizer.Keyword, self.input[start:self.pos].lower()
else:
token_type, token_value = Tokenizer.Id, self.input[start:self.pos]
elif self.pos < len(self.input) and self.input[self.pos] == "'":
self.pos += 1 # skip the first quote since that is not part of the string
start = self.pos # also move the start cursor
while self.pos < len(self.input) and self.input[self.pos] != "'":
self.pos += 1
            # note: an unterminated string constant is accepted up to the end of the input
            token_type, token_value = Tokenizer.StringConstant, self.input[start:self.pos]
            self.pos += 1  # skip the closing quote, which is not part of the string
elif self.pos < len(self.input) and self.input[self.pos] in string.digits:
while self.pos < len(self.input) and self.input[self.pos] in string.digits:
self.pos += 1
token_type, token_value = Tokenizer.IntConstant, int(self.input[start:self.pos])
        elif self.pos < len(self.input) and self.input[self.pos] in (',', '.', '=', '(', ')'):
            self.pos += 1
            token_type, token_value = Tokenizer.DELIMITER, self.input[start:self.pos]
        else:
            # without this branch, an unrecognized character would leave pos
            # unchanged and callers would loop forever on (None, None)
            raise Exception("Unrecognized character " + repr(self.input[self.pos]))
        self._tokenIndex += 1
        return token_type, token_value
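# A hypothetical trace of the tokenizer on a small input (illustrative, not
# part of the original file):
#   t = Tokenizer("select a from tbl where b = 'joe'")
#   t.nextToken()  # -> (Tokenizer.Keyword, 'select')
#   t.nextToken()  # -> (Tokenizer.Id, 'a')
#   t.nextToken()  # -> (Tokenizer.Keyword, 'from')
#   t.nextToken()  # -> (Tokenizer.Id, 'tbl')
#   t.nextToken()  # -> (Tokenizer.Keyword, 'where')
#   t.nextToken()  # -> (Tokenizer.Id, 'b')
#   t.nextToken()  # -> (Tokenizer.DELIMITER, '=')
#   t.nextToken()  # -> (Tokenizer.StringConstant, 'joe')
#   t.nextToken()  # -> (Tokenizer.EOF, Tokenizer.EOF)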
class Lexer:
def __init__(self, input_query):
self.tokenizer = Tokenizer(input_query)
self.token_type, self.token_value = self.tokenizer.nextToken()
    def matchDelim(self, target_delim):
        # note: only the token value is compared; token_type is not checked
        return self.token_value == target_delim
def matchIntConstant(self):
return self.token_type == Tokenizer.IntConstant
def matchStringConstant(self):
return self.token_type == Tokenizer.StringConstant
def matchKeyword(self, target_keyword):
return self.token_type == Tokenizer.Keyword and self.token_value == target_keyword
def matchId(self):
return self.token_type == Tokenizer.Id and self.token_value not in Tokenizer.KEYWORD_LIST
def eatDelim(self, target_delim):
if not self.matchDelim(target_delim):
raise Exception("Expecting delimiter " + str(target_delim))
        self.token_type, self.token_value = self.tokenizer.nextToken()
        # the delimiter is consumed but, unlike the other eat* methods, not returned
def eatIntConstant(self):
if not self.matchIntConstant():
raise Exception("Expecting Int Constant")
temp = self.token_value
self.token_type, self.token_value = self.tokenizer.nextToken()
return temp
def eatStringConstant(self):
if not self.matchStringConstant():
raise Exception("Expecting String constant")
temp = self.token_value
self.token_type, self.token_value = self.tokenizer.nextToken()
return temp
def eatKeyword(self, target_keyword):
if not self.matchKeyword(target_keyword):
raise Exception("Expecting SQL keyword")
temp = self.token_value
self.token_type, self.token_value = self.tokenizer.nextToken()
return temp
def eatId(self):
if not self.matchId():
raise Exception("Expecting SQL Identifier")
temp = self.token_value
self.token_type, self.token_value = self.tokenizer.nextToken()
return temp
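# Typical lexer usage (illustrative, not part of the original file): peek with
# a match* method, then advance with the corresponding eat* method, e.g.
#   lex = Lexer("where a = 5")
#   if lex.matchKeyword('where'):
#       lex.eatKeyword('where')  # advance past the keyword
#       lhs = lex.eatId()        # 'a'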
class Parser:
def __init__(self, input_query):
self.lex: Lexer = Lexer(input_query)
def field(self):
return self.lex.eatId()
def constant(self):
        # TODO: internally, string and int constants are handled identically
if self.lex.matchStringConstant():
return Constant(self.lex.eatStringConstant())
else:
return Constant(self.lex.eatIntConstant())
def expression(self):
if self.lex.matchId():
return Expression(self.field())
else:
return Expression(self.constant())
def term(self):
temp_lhs = self.expression()
self.lex.eatDelim('=')
temp_rhs = self.expression()
return Term(temp_lhs, temp_rhs)
    # a predicate is one or more terms conjoined by the AND operator
def predicate(self):
temp_pred = Predicate(self.term())
if self.lex.matchKeyword('and'):
self.lex.eatKeyword('and')
temp_pred.conjoinWith(self.predicate())
return temp_pred
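    # Illustrative sketch (not in the original file): parsing
    # "gradyear = 2020 and sname = 'joe'" builds
    # Term(Expression('gradyear'), Expression(Constant(2020))) and
    # Term(Expression('sname'), Expression(Constant('joe'))),
    # conjoined right-recursively via conjoinWith.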
def query(self):
self.lex.eatKeyword('select')
temp_fields = self.selectList()
self.lex.eatKeyword('from')
temp_tables = self.tableList()
temp_pred = Predicate()
if self.lex.matchKeyword('where'):
self.lex.eatKeyword('where')
temp_pred = self.predicate()
return {'fields':temp_fields, 'tables':temp_tables, 'predicate':temp_pred}
    def selectList(self):
        temp_list = [self.field()]
        if self.lex.matchDelim(','):
            self.lex.eatDelim(',')
            # the recursive call consumes the rest of the list, mirroring tableList
            temp_list.extend(self.selectList())
        return temp_list
def tableList(self):
temp_list = []
temp_list.append(self.lex.eatId()) # self.lex.eatId() is equivalent to self.field()
if self.lex.matchDelim(','):
self.lex.eatDelim(',')
temp_list.extend(self.tableList())
return temp_list
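

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original file); it assumes
    # RelationalOp supplies the Constant, Expression, Term and Predicate
    # classes used above.
    parser = Parser("select sname, gradyear from student, dept "
                    "where did = 5 and sname = 'joe'")
    result = parser.query()
    print(result['fields'])     # ['sname', 'gradyear']
    print(result['tables'])     # ['student', 'dept']
    print(result['predicate'])  # Predicate object conjoining the two terms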