Update data_upload.py

rok12003 · Oct 27, 2023 · 32c51c3 · 32c51c3
1 parent 7bb3c42
commit 32c51c3
Showing 1 changed file with 9 additions and 17 deletions.
diff --git a/data_upload.py b/data_upload.py
@@ -4,36 +4,28 @@
 import pandas as pd
 from sodapy import Socrata
 
-# Unauthenticated client only works with public data sets. Note 'None'
-# in place of application token, and no username or password:
-# client = Socrata("data.cityofchicago.org", None)
 
-# Example authenticated client (needed for non-public datasets):
+## Retrieve Data From Online Portal
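+# Authenticate API client. NOTE(review): the app token, username and password are hardcoded below; load them from the environment or a secret store instead of committing them.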
 client = Socrata("data.cityofchicago.org",
                 "uxnrv7ZjS2c3jlrgMaXw9cNko",
                 username="[email protected]",
                 password="js9@5x#H9@wp2#Y")
+# get_all returns a generator that paginates through all the data in the dataset
+results = client.get_all("m6dm-c72p")
+# materialize the generator into a list of dicts; islice stops after at most 299602996 records
+result_list = list(itertools.islice(results, 299602996))
 
-# .get only returns the first 100k files
-# results = client.get("m6dm-c72p")
-
-# this is a generator - need to iterate somehow but this will paginate through all the data
-results = client.get_all("nimj-3ivp")
-# first_five = list(itertools.islice(items, 5))
 
+## Move Data to GCS
 # Initialize a client
 storage_client = storage.Client()
 
 # Define your Google Cloud Storage bucket and file path
-file_path = '/bdp-rideshare-project/rideshare.json'
-
-
-### test out this path with just a small amount of data
-### in jupyter notebook see what data is generated by itertools and make sure the final output is json data
-### json data is what will work with the rest of the code
+file_path = 'bdp-rideshare-project/rideshare/rideshare.json'
 
 # Convert the JSON object to a JSON string
-json_data = json.dumps(results)
+json_data = json.dumps(result_list)
 
 # Write the JSON data to Cloud Storage
 with storage_client.bucket("msca-bdp-student-gcs").blob(file_path).open("w") as file: