forked from abejburton/bdp-rideshare
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7bb3c42
commit 32c51c3
Showing
1 changed file
with
9 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,36 +4,28 @@ | |
import pandas as pd | ||
from sodapy import Socrata | ||
|
||
# Unauthenticated client only works with public data sets. Note 'None' | ||
# in place of application token, and no username or password: | ||
# client = Socrata("data.cityofchicago.org", None) | ||
|
||
# Example authenticated client (needed for non-public datasets): | ||
## Retrieve Data From Online Portal | ||
# Authenticate API client: | ||
client = Socrata("data.cityofchicago.org", | ||
"uxnrv7ZjS2c3jlrgMaXw9cNko", | ||
username="[email protected]", | ||
password="js9@5x#H9@wp2#Y") | ||
# results is a generator | ||
results = client.get_all("m6dm-c72p") | ||
# consume the generator (up to the islice cap) to build a list of dicts | |
result_list = list(itertools.islice(results, 299602996)) | ||
|
||
# .get only returns the first 100k records | |
# results = client.get("m6dm-c72p") | ||
|
||
# get_all returns a generator that paginates through all the data; it must be iterated to retrieve the rows | |
results = client.get_all("nimj-3ivp") | ||
# first_five = list(itertools.islice(items, 5)) | ||
|
||
## Move Data to GCS | ||
# Initialize a client | ||
storage_client = storage.Client() | ||
|
||
# Define your Google Cloud Storage bucket and file path | ||
file_path = '/bdp-rideshare-project/rideshare.json' | ||
|
||
|
||
### test out this path with just a small amount of data | ||
### in a Jupyter notebook, check what data itertools produces and make sure the final output is JSON data | ||
### json data is what will work with the rest of the code | ||
file_path = 'bdp-rideshare-project/rideshare/rideshare.json' | ||
|
||
# Convert the JSON object to a JSON string | ||
json_data = json.dumps(results) | ||
json_data = json.dumps(result_list) | ||
|
||
# Write the JSON data to Cloud Storage | ||
with storage_client.bucket("msca-bdp-student-gcs").blob(file_path).open("w") as file: | ||
|