diff --git a/data_upload.py b/data_upload.py
index e0bc726..59a1a91 100644
--- a/data_upload.py
+++ b/data_upload.py
@@ -4,36 +4,28 @@ import pandas as pd
 from sodapy import Socrata
 
-# Unauthenticated client only works with public data sets. Note 'None'
-# in place of application token, and no username or password:
-# client = Socrata("data.cityofchicago.org", None)
-# Example authenticated client (needed for non-public datasets):
+## Retrieve Data From Online Portal
+# Authenticate API client:
 client = Socrata("data.cityofchicago.org",
                  "uxnrv7ZjS2c3jlrgMaXw9cNko",
                  username="abeburton@me.com",
                  password="js9@5x#H9@wp2#Y")
 
+# results is a generator
+results = client.get_all("m6dm-c72p")
+# select all indices of the generator to make a list of dicts
+result_list = list(itertools.islice(results, 299602996))
 
-# .get only returns the first 100k files
-# results = client.get("m6dm-c72p")
-
-# this is a generator - need to iterate somehow but this will paginate through all the data
-results = client.get_all("nimj-3ivp")
-# first_five = list(itertools.islice(items, 5))
 
 
+## Move Data to GCS
 # Initialize a client
 storage_client = storage.Client()
 
 # Define your Google Cloud Storage bucket and file path
-file_path = '/bdp-rideshare-project/rideshare.json'
-
-
-### test out this path with just a small amount of data
-### in jupyter notebook see what data is generated by itertools and make sure the final output is json data
-### json data is what will work with the rest of the code
+file_path = 'bdp-rideshare-project/rideshare/rideshare.json'
 
 # Convert the JSON object to a JSON string
-json_data = json.dumps(result_list)
+json_data = json.dumps(result_list)
 
 # Write the JSON data to Cloud Storage
 with storage_client.bucket("msca-bdp-student-gcs").blob(file_path).open("w") as file:
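
Note on the approach in this change: it materializes the entire dataset into one Python list (the islice bound of 299602996 rows) and then into one JSON string before uploading, which can exhaust memory on a single machine. Below is a minimal sketch of a streaming alternative that drains the same get_all() generator in fixed-size batches and writes newline-delimited JSON to the same blob path. The placeholder credentials, the 100,000-row batch size, and the NDJSON output format are assumptions for illustration, not part of this change; downstream code would need to read one JSON object per line rather than a single array.

import itertools
import json

from google.cloud import storage
from sodapy import Socrata

# Placeholder credentials for illustration only; substitute real values.
client = Socrata("data.cityofchicago.org",
                 "YOUR_APP_TOKEN",
                 username="you@example.com",
                 password="YOUR_PASSWORD")

# get_all() returns a generator that paginates through the full dataset.
results = client.get_all("m6dm-c72p")

storage_client = storage.Client()
blob = storage_client.bucket("msca-bdp-student-gcs").blob(
    "bdp-rideshare-project/rideshare/rideshare.json"
)

# Stream rows out in batches so the full dataset never sits in memory.
# The batch size of 100,000 is an assumed, tunable figure.
with blob.open("w") as file:
    while True:
        batch = list(itertools.islice(results, 100_000))
        if not batch:
            break
        file.write("\n".join(json.dumps(row) for row in batch) + "\n")

With this shape, peak memory is bounded by one batch rather than the whole dataset, and the upload starts immediately instead of after a single giant json.dumps call.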