-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathzat_parser.py
186 lines (164 loc) · 6.06 KB
/
zat_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Authors: Zack Dulac, Trung Nguyen, Abby Tse
Date: 22 October 2018
FILE: skeleton_parser.py
------------------
Skeleton parser for programming project 1. Has useful imports and
functions for parsing, including:
1) Directory handling -- the parser takes a list of eBay json files
and opens each file inside of a loop. You just need to fill in the rest.
2) Dollar value conversions -- the json files store dollar value amounts in
a string like $3,453.23 -- we provide a function to convert it to a string
like XXXXX.xx.
3) Date/time conversions -- the json files store dates/ times in the form
Mon-DD-YY HH:MM:SS -- we wrote a function (transformDttm) that converts to the
for YYYY-MM-DD HH:MM:SS, which will sort chronologically in SQL.
Your job is to implement the parseJson function, which is invoked on each file by
the main function. We create the initial Python dictionary object of items for
you; the rest is up to you!
Happy parsing!
"""
import sys
from json import loads
from re import sub
columnSeparator = "|"
# Dictionary of months used for date transformation
MONTHS = {'Jan':'01','Feb':'02','Mar':'03','Apr':'04','May':'05','Jun':'06',\
'Jul':'07','Aug':'08','Sep':'09','Oct':'10','Nov':'11','Dec':'12'}
"""
Returns true if a file ends in .json
"""
def isJson(f):
return len(f) > 5 and f[-5:] == '.json'
"""
Converts month to a number, e.g. 'Dec' to '12'
"""
def transformMonth(mon):
if mon in MONTHS:
return MONTHS[mon]
else:
return mon
"""
Transforms a timestamp from Mon-DD-YY HH:MM:SS to YYYY-MM-DD HH:MM:SS
"""
def transformDttm(dttm):
dttm = dttm.strip().split(' ')
dt = dttm[0].split('-')
date = '20' + dt[2] + '-'
date += transformMonth(dt[0]) + '-' + dt[1]
return date + ' ' + dttm[1]
"""
Transform a dollar value amount from a string like $3,453.23 to XXXXX.xx
"""
def transformDollar(money):
if money == None or len(money) == 0 or money == "NULL":
return money
return sub(r'[^\d.]', '', money)
def escapeQuote(string):
if string == None:
return string
return '\"' + sub(r'\"','\"\"',string) + '\"'
"""
Schema of Item table is
Items (ItemID, SellerID, Name, Buy_Price, First_Bid, Currently,
Number_of_Bids, Started, Ends, Description)
"""
def parseItem(dictionary):
with open("items.dat", "a") as f:
item = []
item.append(dictionary["ItemID"])
item.append(dictionary["Seller"]["UserID"])
item.append(escapeQuote(dictionary["Name"]))
item.append(transformDollar(dictionary.get("Buy_Price", "NULL")))
item.append(transformDollar(dictionary["First_Bid"]))
item.append(transformDollar(dictionary["Currently"]))
item.append(dictionary["Number_of_Bids"])
item.append(transformDttm(dictionary["Started"]))
item.append(transformDttm(dictionary["Ends"]))
item.append(escapeQuote(dictionary["Description"]))
f.write("|".join(map(lambda s: s or "", item)))
f.write("\n")
"""
Schema of User table is
User (UserID, Rating, Location, Country)
"""
def parseUser(dictionary):
with open("users.dat", "a") as f:
bids = dictionary.get("Bids")
users = set()
if bids != None:
for bid in bids:
info = []
bidder = bid["Bid"]["Bidder"]
info.append(bidder["UserID"])
info.append(bidder["Rating"])
info.append(escapeQuote(bidder.get("Location", "NULL")))
info.append(escapeQuote(bidder.get("Country", "NULL")))
users.add("|".join(info))
seller = []
seller.append(dictionary["Seller"]["UserID"])
seller.append(dictionary["Seller"]["Rating"])
seller.append(escapeQuote(dictionary.get("Location", "NULL")))
seller.append(escapeQuote(dictionary.get("Country", "NULL")))
users.add("|".join(seller))
f.write("\n".join(users))
f.write("\n")
"""
Schema of Categories table is
Categories (ItemID, Category)
"""
def parseCategory(dictionary):
with open("category.dat", "a") as f:
category = ["|".join([dictionary["ItemID"], c]) \
for c in dictionary.get("Category")]
f.write("\n".join(category))
f.write("\n")
"""
Schema of Bids table is
Bids (ItemID, UserID, Time, Amount)
"""
def parseBids(dictionary):
with open("bids.dat", "a") as f:
bids = dictionary.get("Bids")
if bids != None:
for bid in bids:
info = []
info.append(dictionary["ItemID"])
info.append(bid["Bid"]["Bidder"]["UserID"])
info.append(transformDttm(bid["Bid"]["Time"]))
info.append(transformDollar(bid["Bid"]["Amount"]))
f.write("|".join(info) + "\n")
"""
Parses a single json file. Currently, there's a loop that iterates over each
item in the data set. Your job is to extend this functionality to create all
of the necessary SQL tables for your database.
"""
def parseJson(json_file):
with open(json_file, 'r') as f:
items = loads(f.read())['Items']
# creates a Python dictionary of Items for the supplied json file
for item in items:
"""
Traverse the items dictionary to extract information from the
given `json_file' and generate the necessary .dat files to generate
the SQL tables based on your relation design
"""
parseItem(item)
parseUser(item)
parseCategory(item) #Shellscript will overwrite this to categoryOf.dat and use the output redirection operator and awk command to create category.dat to follow E/R Model
parseBids(item)
"""
Loops through each json files provided on the command line and passes each file
to the parser
"""
def main(argv):
if len(argv) < 2:
print >> sys.stderr, 'Usage: python skeleton_json_parser.py <path to json files>'
sys.exit(1)
# loops over all .json files in the argument
for f in argv[1:]:
if isJson(f):
parseJson(f)
print ("Success parsing " + f)
if __name__ == '__main__':
main(sys.argv)