forked from abejburton/bdp-rideshare
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request abejburton#4 from abejburton/eda_ridhi
EDA for geospatial big data in BigQuery
- Loading branch information
Showing
4 changed files
with
2,548 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash
#
# Loads the processed rideshare CSV exports stored in Google Cloud Storage
# into BigQuery, one table per export folder.

# BigQuery destination (project and dataset) and GCS source location.
PROJECT_ID="msca-bdp-student-ap"
DATASET_NAME="chicago_rideshare"
BUCKET_NAME="msca-bdp-student-gcs"
FOLDER_PATH="bdp-rideshare-project/rideshare/processed_data"

# Names of the GCS folders to load. Each entry is a folder (its name happens
# to end in ".csv") that holds multiple CSV part files; the table name is
# derived from the folder name with the ".csv" suffix removed.
# NOTE(review): "program_area_2020.csv" lacks the "time_rides" infix used by
# the other program_area entries — confirm this matches the folder in GCS.
FOLDERS=(
  "program_area_2020.csv"
  "program_area_time_rides_2018.csv"
  "program_area_time_rides_2019.csv"
  "program_area_time_rides_2021.csv"
  "program_area_time_rides_2022.csv"
  "rides_2018.csv"
  "rides_2019.csv"
  "rides_2020.csv"
  "rides_2021.csv"
  "rides_2022.csv"
)
|
||
#######################################
# Append every CSV file in one GCS folder to a BigQuery table.
# Globals:   PROJECT_ID, DATASET_NAME, BUCKET_NAME, FOLDER_PATH (read)
# Arguments: $1 - destination table name
#            $2 - folder name under FOLDER_PATH that holds the CSV files
# Outputs:   prints the resolved GCS wildcard path to stdout
# Returns:   2 if an argument is missing or empty; otherwise the exit
#            status of `bq load`
#######################################
load_into_bigquery() {
  if [[ -z "${1:-}" || -z "${2:-}" ]]; then
    echo "usage: load_into_bigquery TABLE_NAME FOLDER_NAME" >&2
    return 2
  fi

  local TABLE_NAME="$1"
  local FOLDER_NAME="$2"
  # Kept local so the path does not leak into (or clobber) the caller's scope.
  local GCS_FOLDER_PATH

  # Wildcard URI matching all CSV files directly inside the folder. It is
  # passed quoted so that bq, not the shell, interprets the '*'.
  GCS_FOLDER_PATH="gs://$BUCKET_NAME/$FOLDER_PATH/$FOLDER_NAME/*.csv"
  echo "GCS Folder Path: $GCS_FOLDER_PATH"

  # --noreplace appends to the table instead of overwriting it, so running
  # this again for the same folder loads the same rows a second time.
  bq load --autodetect --source_format=CSV --noreplace \
    "$PROJECT_ID:$DATASET_NAME.$TABLE_NAME" "$GCS_FOLDER_PATH"
  return $?
}
|
||
# Load each folder into a table named after the folder with its ".csv"
# suffix removed. A failed load does not stop the remaining loads; all
# failures are reported together at the end and make the script exit
# non-zero, so a caller can detect a partial load.
FAILED_TABLES=()
for FOLDER in "${FOLDERS[@]}"; do
  TABLE=$(basename "$FOLDER" .csv)
  if ! load_into_bigquery "$TABLE" "$FOLDER"; then
    FAILED_TABLES+=("$TABLE")
  fi
done

if (( ${#FAILED_TABLES[@]} > 0 )); then
  printf 'Failed to load table: %s\n' "${FAILED_TABLES[@]}" >&2
  exit 1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "5e5fe419-6a99-4aa5-a4b4-c14f0c8038ec", | ||
"metadata": {}, | ||
"source": [ | ||
"**Execute createTables.sh to create BigQuery tables of processed data**" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "24c29de1-76dd-4574-ac41-f702016a48ed", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/program_area_2020.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_ree5454c8f84a40a_0000018c1c612c1d_1 ... (4s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/program_area_time_rides_2018.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r1fb15f9391984c2d_0000018c1c6147a8_1 ... (3s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/program_area_time_rides_2019.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r127d33cec7d892f4_0000018c1c615e7b_1 ... (3s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/program_area_time_rides_2021.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r91ea67eb753b684_0000018c1c617585_1 ... (3s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/program_area_time_rides_2022.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r3ff4aa25be604e5f_0000018c1c618c92_1 ... (4s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2018.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r33cd8011d68c1058_0000018c1c61a789_1 ... (14s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2019.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r24568683d6f2a722_0000018c1c61eb11_1 ... (134s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2020.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r123d602acce547c9_0000018c1c640291_1 ... (23s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2021.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r55d21b6694e2e603_0000018c1c646673_1 ... (23s) Current status: DONE \n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GCS Folder Path: gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2022.csv/*.csv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r46ac7ffca55cbb3b_0000018c1c64ca21_1 ... (22s) Current status: DONE \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%%bash\n", | ||
"\n", | ||
"bash \"bdp-rideshare/createTables.sh\"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "fb8ef954-1efa-43d9-aef7-e4186daf61fe", | ||
"metadata": {}, | ||
"source": [ | ||
"**Create table for community areas stored in newline-delimited GeoJSON format**" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "8d4c7418-8481-427c-8097-6e7dae4058d4", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Waiting on bqjob_r58ef9bd81b81ca11_0000018c1c652db0_1 ... (2s) Current status: DONE \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!bq load \\\n", | ||
" --source_format=NEWLINE_DELIMITED_JSON \\\n", | ||
" --json_extension=GEOJSON \\\n", | ||
" --autodetect \\\n", | ||
" chicago_rideshare.community_areas \\\n", | ||
" gs://msca-bdp-student-gcs/bdp-rideshare-project/neighborhoods/geojson/community_nl.geojsonl" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "8b5d91e7-5a05-4e61-8931-f593a49ba159", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.15" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.