forked from abejburton/bdp-rideshare
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_upload.py
42 lines (31 loc) · 1.42 KB
/
data_upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""Download the Chicago rideshare dataset from the Socrata API and upload it
to Google Cloud Storage as a single JSON file."""
from google.cloud import storage
import json
import os
import itertools  # noqa: F401  (kept: may be used by related notebooks/examples)
import pandas as pd  # noqa: F401  (kept: may be used by related notebooks/examples)
from sodapy import Socrata

# SECURITY: credentials must come from the environment, never from source
# control — a previous revision of this script hard-coded a live app token,
# username, and password. With no app token (None) the client still works
# against public datasets, just with stricter throttling.
client = Socrata(
    "data.cityofchicago.org",
    os.environ.get("SOCRATA_APP_TOKEN"),  # None => unauthenticated client
    username=os.environ.get("SOCRATA_USERNAME"),
    password=os.environ.get("SOCRATA_PASSWORD"),
)

# client.get() caps out at the first 100k records; get_all() returns a
# generator that transparently paginates through the entire dataset.
results = client.get_all("nimj-3ivp")

# Client for Google Cloud Storage (uses application-default credentials).
storage_client = storage.Client()

# Destination object path inside the bucket.
file_path = '/bdp-rideshare-project/rideshare.json'

# BUG FIX: json.dumps() cannot serialize a generator (it raises TypeError),
# so the paginated results must be materialized into a list first. NOTE:
# this loads the full dataset into memory — acceptable here, but a streaming
# write would be needed for substantially larger datasets.
json_data = json.dumps(list(results))

# Write the JSON payload to Cloud Storage; the context manager ensures the
# blob handle is flushed and closed even if the write fails partway.
with storage_client.bucket("msca-bdp-student-gcs").blob(file_path).open("w") as file:
    file.write(json_data)

print("JSON data written to Cloud Storage")