From 0211656438479bee8d2ae596796e7a4a85eb7a89 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Mon, 7 Nov 2022 09:56:02 -0800 Subject: [PATCH] Add notebooks for EIS Fire data (#10) * Add notebooks for fetching data from S3 and inserting into the features database * Add annotation Co-authored-by: j08lue --- .gitignore | 2 + notebooks/add_fire_features.ipynb | 249 ++++++++++++++++++++++++++++++ notebooks/env.example | 6 + notebooks/fetch_fire_data.ipynb | 191 +++++++++++++++++++++++ notebooks/requirements.txt | 1 + 5 files changed, 449 insertions(+) create mode 100644 notebooks/add_fire_features.ipynb create mode 100755 notebooks/env.example create mode 100644 notebooks/fetch_fire_data.ipynb create mode 100644 notebooks/requirements.txt diff --git a/.gitignore b/.gitignore index db827a5..b5bd985 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ cdk.out .idea .env +.ipynb_checkpoints +data/ diff --git a/notebooks/add_fire_features.ipynb b/notebooks/add_fire_features.ipynb new file mode 100644 index 0000000..a1aeef5 --- /dev/null +++ b/notebooks/add_fire_features.ipynb @@ -0,0 +1,249 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc060b63", + "metadata": {}, + "source": [ + "# Ingest fire features" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1ea8ffd5", + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas\n", + "import geoalchemy2\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "import boto3\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "6a44355b", + "metadata": {}, + "source": [ + "## Manual method" + ] + }, + { + "cell_type": "markdown", + "id": "b5308956", + "metadata": {}, + "source": [ + "### List locally stored FlatGeobuf files" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "caeca9f7", + "metadata": {}, + "outputs": [], + "source": [ + "table_prefix = 'eis_fire'\n", + "data_dir = 'data/eis_fire'\n", + "years = ['2019', '2020']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0655c2e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['data/eis_fire/2019/newfirepix.fgb',\n", + " 'data/eis_fire/2019/perimeter.fgb',\n", + " 'data/eis_fire/2019/fireline.fgb',\n", + " 'data/eis_fire/2020/newfirepix.fgb',\n", + " 'data/eis_fire/2020/perimeter.fgb',\n", + " 'data/eis_fire/2020/fireline.fgb']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files_to_insert = []\n", + "for year in years:\n", + " directory = f\"{data_dir}/{year}\"\n", + " files = os.listdir(directory)\n", + " for file in files:\n", + " files_to_insert.append(f\"{directory}/{file}\")\n", + "\n", + "files_to_insert" + ] + }, + { + "cell_type": "markdown", + "id": "2920b0e4", + "metadata": {}, + "source": [ + "### Get database configuration from vault" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "42c477a0", + "metadata": {}, + "outputs": [], + "source": [ + "client = boto3.client('secretsmanager')\n", + "response = client.get_secret_value(SecretId='OMITTED')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "82ed63e3", + "metadata": {}, + "outputs": [], + "source": [ + "secrets = json.loads(response['SecretString'])\n", + "host = secrets['host']\n", + "password = secrets['password']\n", + "username = secrets['username']\n", + "database = secrets['dbname']" + ] + }, + { + "cell_type": "markdown", + "id": "1ae3533f", + "metadata": {}, + "source": [ + "### Run 
`ogr2ogr` on files" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "6de7d13b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "inserting data/eis_fire/2020/fireline.fgb into eis_fire_fireline\n" + ] + } + ], + "source": [ + "file_to_insert = files_to_insert[5]\n", + "tablename = f\"{table_prefix}_{file_to_insert.split('/')[-1].split('.')[0]}\"\n", + "print(f\"inserting {file_to_insert} into {tablename}\")\n", + "connection_string = f\"'host={host} dbname={database} user={username} password={password}'\"\n", + "!ogr2ogr -f \"PostgreSQL\" PG:{connection_string} -t_srs EPSG:4326 {file_to_insert} -nln {tablename} -append" + ] + }, + { + "cell_type": "markdown", + "id": "84b7ec5e", + "metadata": {}, + "source": [ + "## Attempt at " + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "d787fbf2", + "metadata": {}, + "outputs": [], + "source": [ + "# Check the number of rows against the database insertion" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "5775b0e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files for newfirepix have 44488 rows combined.\n", + "Input files for perimeter have 44567 rows combined.\n", + "Input files for fireline have 43661 rows combined.\n" + ] + } + ], + "source": [ + "tables = ['newfirepix', 'perimeter', 'fireline']\n", + "# check number of rows in data frames\n", + "for table in tables:\n", + " input_files = [f\"{data_dir}/{year}/{table}.fgb\" for year in years]\n", + " # count rows in both inputs from 2019 and 2020\n", + " rows = 0\n", + " for file in input_files:\n", + " df = geopandas.read_file(file)\n", + " rows += df.shape[0]\n", + " print(f\"Input files for {table} have {rows} rows combined.\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "f2b9f5f4", + "metadata": {}, + "outputs": [], + "source": [ + "# this results in an internal server error when trying to load items in the VEDA Features API\n", + "# File \"pydantic/main.py\", line 342, in pydantic.main.BaseModel.__init__\n", + "#pydantic.error_wrappers.ValidationError: 10 validation errors for FeatureCollection\n", + "#features -> 0 -> id\n", + " # str type expected (type=type_error.str)\n", + "#features -> 1 -> id\n", + "#df_reproj.to_postgis(tablename, engine, index=False, if_exists=\"append\")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "525c4eb8", + "metadata": {}, + "outputs": [], + "source": [ + "# from sqlalchemy import text\n", + "# engine = create_engine(f\"postgresql://{username}:{password}@{host}:5432/{database}\") \n", + "# sql = text('DROP TABLE IF EXISTS fire_boundaries3;')\n", + "# result = engine.execute(sql)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/env.example b/notebooks/env.example new file mode 100755 index 0000000..eb0c483 --- /dev/null +++ b/notebooks/env.example @@ -0,0 +1,6 @@ +IDENTITY_POOL_ID=us-west-2:XXX +USER_POOL_ID=us-west-XXX +CLIENT_ID=XXX +USERNAME=XXX +PASSWORD=XXX + diff --git a/notebooks/fetch_fire_data.ipynb 
b/notebooks/fetch_fire_data.ipynb new file mode 100644 index 0000000..cb3a4e2 --- /dev/null +++ b/notebooks/fetch_fire_data.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e2642530", + "metadata": {}, + "source": [ + "# Fetch EIS Fire files using Cognito" + ] + },
+ { + "cell_type": "markdown", + "id": "e08b576d", + "metadata": {}, + "source": [ + "## Create AWS S3 Short-Term Access Credentials" + ] + },
+ { + "cell_type": "code", + "execution_count": 5, + "id": "8080299b", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "from cognito_client import CognitoClient" + ] + },
+ { + "cell_type": "markdown", + "id": "dacf3a5e", + "metadata": {}, + "source": [ + "## Instantiate the CognitoClient\n", + "\n", + "Note: You will either need to pass the appropriate Cognito Identity Pool ID, User Pool ID, and Client ID to the instantiation of `CognitoClient()` or create an `.env` file with the variables listed in `env.example`." + ] + },
+ { + "cell_type": "code", + "execution_count": 2, + "id": "d3f4b857", + "metadata": {}, + "outputs": [], + "source": [ + "client = CognitoClient()" + ] + },
+ { + "cell_type": "code", + "execution_count": 3, + "id": "7f74084c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "········\n" + ] + } + ], + "source": [ + "_ = client.login()" + ] + },
+ { + "cell_type": "markdown", + "id": "2481586e", + "metadata": {}, + "source": [ + "## Use AWS short-term access credentials to call AWS via the CLI\n", + "\n", + "You can use the AWS CLI with the short-term credentials generated by the following code:" + ] + },
+ { + "cell_type": "code", + "execution_count": 6, + "id": "e88797ea", + "metadata": {}, + "outputs": [], + "source": [ + "creds = client.get_aws_credentials()" + ] + },
+ { + "cell_type": "markdown", + "id": "2d11ae16", + "metadata": {}, + "source": [ + "## Use boto3 for AWS operations\n", + "\n", + "You can also use boto3 for AWS operations. At this time the roles associated with Cognito identities are associated with S3 access policies, so you will want to refer to the [S3 Boto3 Docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html).\n", + "\n", + "Below is an example of how to list and download the fire data files."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d2a85f4a", + "metadata": {}, + "outputs": [], + "source": [ + "session = boto3.Session(aws_access_key_id=creds[\"AccessKeyId\"], \n", + " aws_secret_access_key=creds[\"SecretKey\"],\n", + " aws_session_token=creds[\"SessionToken\"])\n", + "\n", + "client = session.client('s3')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f747e9c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Got a 200 status code response.\n", + "[{'key': 'EIS/other/feds-WesternUS/2019/Largefire/00-latest/fireline.fgb', 'size': 43995344}, {'key': 'EIS/other/feds-WesternUS/2019/Largefire/00-latest/newfirepix.fgb', 'size': 5531400}, {'key': 'EIS/other/feds-WesternUS/2019/Largefire/00-latest/perimeter.fgb', 'size': 254353056}]\n", + "Downloading fireline.fgb to data/eis_fire/2019/fireline.fgb\n", + "Downloading newfirepix.fgb to data/eis_fire/2019/newfirepix.fgb\n", + "Downloading perimeter.fgb to data/eis_fire/2019/perimeter.fgb\n", + "Got a 200 status code response.\n", + "[{'key': 'EIS/other/feds-WesternUS/2020/Largefire/00-latest/fireline.fgb', 'size': 36942848}, {'key': 'EIS/other/feds-WesternUS/2020/Largefire/00-latest/newfirepix.fgb', 'size': 12485496}, {'key': 'EIS/other/feds-WesternUS/2020/Largefire/00-latest/perimeter.fgb', 'size': 196043928}]\n", + "Downloading fireline.fgb to data/eis_fire/2020/fireline.fgb\n", + "Downloading newfirepix.fgb to data/eis_fire/2020/newfirepix.fgb\n", + "Downloading perimeter.fgb to data/eis_fire/2020/perimeter.fgb\n" + ] + } + ], + "source": [ + "import os\n", + "years = ['2019', '2020']\n", + "bucket = 'veda-data-store-staging'\n", + "download_data_dir = 'data/eis_fire'\n", + "\n", + "for year in years:\n", + " prefix = f'EIS/other/feds-WesternUS/{year}/Largefire/00-latest'\n", + " year_dir = f\"{download_data_dir}/{year}\"\n", + " if not os.path.exists(year_dir):\n", + " os.makedirs(year_dir) \n", + " response = client.list_objects_v2(\n", + " Bucket=bucket,\n", + " Prefix=prefix)\n", + " response_status = response['ResponseMetadata']['HTTPStatusCode']\n", + " if response_status == 200:\n", + " print(f\"Got a {response['ResponseMetadata']['HTTPStatusCode']} status code response.\")\n", + " else:\n", + " print(f\"Got unexpected response: {response}\") \n", + " objects_to_download = [{'key': r['Key'], 'size': r['Size']} for r in response['Contents']]\n", + " print(objects_to_download)\n", + " for obj in objects_to_download:\n", + " object_key, object_size = obj.values()\n", + " if object_size >= 1e+9:\n", + " print(f'{object_key} is {object_size}. 
Do you really want to download a gb of data?')\n", + " else:\n", + " filename = object_key.split('/')[-1]\n", + " download_location = f\"{year_dir}/{filename}\"\n", + " print(f\"Downloading {filename} to {download_location}\")\n", + " client.download_file(bucket, object_key, download_location) " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt new file mode 100644 index 0000000..b6b8786 --- /dev/null +++ b/notebooks/requirements.txt @@ -0,0 +1 @@ +cognito_client==0.0.1 \ No newline at end of file
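
A possible batch variant of the manual `ogr2ogr` step in `add_fire_features.ipynb` is sketched below. It is only a sketch: it assumes `files_to_insert`, `host`, `database`, `username`, and `password` are already defined as in that notebook, and it shells out to the same `ogr2ogr` command via `subprocess` instead of the IPython `!` magic.

```python
# Sketch only: batch version of the manual ogr2ogr cell in add_fire_features.ipynb.
# Assumes files_to_insert, host, database, username, and password are defined as in
# that notebook and that ogr2ogr (GDAL) is available on the PATH.
import subprocess

table_prefix = "eis_fire"

for file_to_insert in files_to_insert:
    # Derive the table name from the file name, e.g. data/eis_fire/2020/fireline.fgb -> eis_fire_fireline
    tablename = f"{table_prefix}_{file_to_insert.split('/')[-1].split('.')[0]}"
    print(f"inserting {file_to_insert} into {tablename}")
    subprocess.run(
        [
            "ogr2ogr",
            "-f", "PostgreSQL",
            f"PG:host={host} dbname={database} user={username} password={password}",
            "-t_srs", "EPSG:4326",
            file_to_insert,
            "-nln", tablename,
            "-append",
        ],
        check=True,  # raise if ogr2ogr exits non-zero
    )
```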
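
The row-count check in `add_fire_features.ipynb` only counts rows in the input FlatGeobuf files. A minimal sketch of the database-side comparison (assuming the same redacted Secrets Manager secret, a psycopg2 driver, and the `eis_fire_*` tables created by `ogr2ogr`) could look like this:

```python
# Sketch only: count rows in the eis_fire_* tables to compare against the
# combined input-file counts printed in add_fire_features.ipynb.
import json

import boto3
from sqlalchemy import create_engine, text

client = boto3.client("secretsmanager")
# SecretId is redacted here, as in the notebook.
secrets = json.loads(client.get_secret_value(SecretId="OMITTED")["SecretString"])

engine = create_engine(
    f"postgresql://{secrets['username']}:{secrets['password']}"
    f"@{secrets['host']}:5432/{secrets['dbname']}"
)

table_prefix = "eis_fire"
for table in ["newfirepix", "perimeter", "fireline"]:
    with engine.connect() as conn:
        count = conn.execute(text(f"SELECT COUNT(*) FROM {table_prefix}_{table}")).scalar()
    print(f"{table_prefix}_{table} has {count} rows in the database.")
```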