diff --git a/scripts/chloropleth.ipynb b/scripts/chloropleth.ipynb new file mode 100644 index 0000000..784888b --- /dev/null +++ b/scripts/chloropleth.ipynb @@ -0,0 +1,1578 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from zipfile import ZipFile\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "from shapely.affinity import scale, translate\n", + "import matplotlib.patheffects as pe\n", + "\n", + "# Directory to store downloaded data\n", + "local_data_dir = \"./data/\"\n", + "os.makedirs(local_data_dir, exist_ok=True)\n", + "\n", + "# eGRID Data URLs\n", + "egrid_urls = {\n", + " 2019: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx',\n", + " \"subregions_kmz\": 'https://www.epa.gov/sites/default/files/2021-02/egrid2019_subregions.kmz',\n", + " },\n", + " 2020: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/system/files/documents/2022-09/eGRID2020_Data_v2.xlsx',\n", + " \"subregions_kmz\": 'https://www.epa.gov/system/files/other-files/2022-01/egrid2020_subregions.kmz',\n", + " },\n", + " 2021: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/system/files/documents/2023-01/eGRID2021_data.xlsx',\n", + " \"subregions_shp\": 'https://gaftp.epa.gov/EPADataCommons/ORD/egrid2021/eGRID2021%20Subregions%20Shapefile.zip',\n", + " },\n", + " 2022: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/system/files/documents/2023-01/egrid2022_data.xlsx',\n", + " \"subregions_kmz\": 'https://www.epa.gov/system/files/other-files/2023-01/egrid2022_subregions.kmz',\n", + " },\n", + "}\n", + "\n", + "# Conversion factor from pounds to kilograms\n", + "LBS_PER_KG = 0.45359237\n", + "\n", + "# Function to download files\n", + "def download_file(url, local_path):\n", + " if not os.path.exists(local_path):\n", + " print(f\"Downloading from {url}...\")\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " with open(local_path, 'wb') as f:\n", + " f.write(response.content)\n", + " print(f\"Downloaded to {local_path}.\")\n", + " else:\n", + " print(f\"Failed to download {url}. Status code: {response.status_code}\")\n", + " else:\n", + " print(f\"{local_path} already downloaded.\")\n", + "\n", + "# Function to extract data for a given year\n", + "def process_year(year, urls):\n", + " print(f\"\\nProcessing data for year {year}...\")\n", + "\n", + " # Paths for Excel and subregions files\n", + " egrid_xlsx_url = urls['egrid_xlsx']\n", + " egrid_xlsx_path = os.path.join(local_data_dir, f\"egrid{year}_data.xlsx\")\n", + "\n", + " # Determine the subregions file type and path\n", + " if 'subregions_kml' in urls:\n", + " subregions_url = urls['subregions_kml']\n", + " subregions_ext = 'kml'\n", + " elif 'subregions_kmz' in urls:\n", + " subregions_url = urls['subregions_kmz']\n", + " subregions_ext = 'kmz'\n", + " elif 'subregions_shp' in urls:\n", + " subregions_url = urls['subregions_shp']\n", + " subregions_ext = 'zip'\n", + " else:\n", + " print(f\"No subregions file provided for year {year}. 
Skipping...\")\n", + " return None\n", + "\n", + " subregions_path = os.path.join(local_data_dir, f\"egrid{year}_subregions.{subregions_ext}\")\n", + "\n", + " # Download files\n", + " download_file(egrid_xlsx_url, egrid_xlsx_path)\n", + " download_file(subregions_url, subregions_path)\n", + "\n", + " # Load carbon intensity data\n", + " try:\n", + " sheet_name = f'SRL{str(year)[-2:]}'\n", + " egrid_intensity_df = pd.read_excel(egrid_xlsx_path, sheet_name=sheet_name)\n", + "\n", + " # Use the relevant column for CO2 intensity\n", + " co2_column_name = [col for col in egrid_intensity_df.columns if 'CO2 total output emission rate (lb/MWh)' in col]\n", + " if co2_column_name:\n", + " co2_column_name = co2_column_name[0]\n", + " else:\n", + " raise ValueError(\"CO2 emission rate column not found.\")\n", + "\n", + " # Extract subregion acronym and carbon intensity\n", + " region_intensity = egrid_intensity_df[['eGRID subregion acronym', co2_column_name]].copy()\n", + " region_intensity.columns = ['SUBRGN', 'SRC2ERTA']\n", + "\n", + " # Convert to numeric and handle NaNs\n", + " region_intensity['SRC2ERTA'] = pd.to_numeric(region_intensity['SRC2ERTA'], errors='coerce')\n", + " region_intensity = region_intensity.dropna(subset=['SRC2ERTA'])\n", + "\n", + " # Convert lbs to kg\n", + " region_intensity['kg_per_mwh'] = region_intensity['SRC2ERTA'] * LBS_PER_KG\n", + " region_intensity = region_intensity[['SUBRGN', 'kg_per_mwh']]\n", + " region_intensity['SUBRGN'] = region_intensity['SUBRGN'].astype(str)\n", + "\n", + " print(f\"Extracted intensity data for {year}:\")\n", + " print(region_intensity.head())\n", + " print(f\"{year} - Available subregions in intensity data:\", region_intensity['SUBRGN'].unique())\n", + "\n", + " except Exception as e:\n", + " print(f\"Failed to read eGRID data for {year}: {e}\")\n", + " return None\n", + "\n", + " # Load subregions into GeoDataFrame\n", + " try:\n", + " if subregions_ext == 'kmz':\n", + " with ZipFile(subregions_path, 'r') as kmz:\n", + " kml_filename = [name for name in kmz.namelist() if name.endswith('.kml')][0]\n", + " kmz.extract(kml_filename, local_data_dir)\n", + " kml_path = os.path.join(local_data_dir, kml_filename)\n", + " gdf = gpd.read_file(kml_path, driver='KML')\n", + " os.remove(kml_path)\n", + " elif subregions_ext == 'kml':\n", + " gdf = gpd.read_file(subregions_path, driver='KML')\n", + " elif subregions_ext == 'zip':\n", + " # Extract the zip file\n", + " with ZipFile(subregions_path, 'r') as zip_ref:\n", + " zip_ref.extractall(local_data_dir)\n", + " # Find the shapefile (.shp)\n", + " shapefile_name = [name for name in zip_ref.namelist() if name.endswith('.shp')][0]\n", + " shapefile_path = os.path.join(local_data_dir, shapefile_name)\n", + " gdf = gpd.read_file(shapefile_path)\n", + " else:\n", + " print(f\"Unsupported subregions file format for year {year}.\")\n", + " return None\n", + "\n", + " # Check for 'SUBRGN' or 'Name' column and rename to 'Name'\n", + " if 'SUBRGN' in gdf.columns:\n", + " gdf = gdf.rename(columns={'SUBRGN': 'Name'})\n", + " elif 'Name' not in gdf.columns:\n", + " raise ValueError(\"Subregion name column not found in geospatial data.\")\n", + "\n", + " # Ensure geometries are in WGS84 CRS (EPSG:4326)\n", + " if gdf.crs != 'EPSG:4326':\n", + " print(f\"Reprojecting geometries to WGS84 for year {year}.\")\n", + " gdf = gdf.to_crs(epsg=4326)\n", + "\n", + " # Merge with carbon intensity data using 'Name' as the identifier\n", + " gdf = gdf.merge(region_intensity, left_on='Name', right_on='SUBRGN', 
how='left')\n", + "\n", + " except Exception as e:\n", + " print(f\"Failed to read subregions for {year}: {e}\")\n", + " return None\n", + "\n", + " # Adjust regions and return the processed GeoDataFrame\n", + " combined_gdf = adjust_regions(gdf, year)\n", + " return combined_gdf\n", + "\n", + "# Function to adjust regions (without plotting)\n", + "def adjust_regions(gdf, year):\n", + " # Define subregions for Alaska and Hawaii\n", + " alaska_subregions = ['AKGD', 'AKMS']\n", + " hawaii_subregions = ['HIMS', 'HIOA', 'HIMS/HIOA']\n", + " puerto_rico_subregions = ['PR', 'PRVI', 'PRMS']\n", + "\n", + " # Attempt to find Puerto Rico subregions in the data\n", + " pr_subregions = gdf[gdf['Name'].str.contains('PR', case=False)]['Name'].unique()\n", + " if len(pr_subregions) == 0:\n", + " # Try to identify PR subregions differently\n", + " pr_subregions = [name for name in gdf['Name'].unique() if 'PR' in name.upper()]\n", + " print(f\"Puerto Rico subregions in data: {pr_subregions}\")\n", + "\n", + " # Separate the regions\n", + " mainland_gdf = gdf[~gdf['Name'].isin(alaska_subregions + hawaii_subregions + list(pr_subregions))]\n", + " alaska_gdf = gdf[gdf['Name'].isin(alaska_subregions)]\n", + " hawaii_gdf = gdf[gdf['Name'].isin(hawaii_subregions)]\n", + " puerto_rico_gdf = gdf[gdf['Name'].isin(pr_subregions)]\n", + "\n", + " # Alaska: Scale and translate\n", + " if not alaska_gdf.empty:\n", + " alaska_gdf = alaska_gdf.copy()\n", + "\n", + " # Compute the centroid of Alaska (combined geometries)\n", + " alaska_centroid = alaska_gdf.geometry.unary_union.centroid\n", + "\n", + " # Desired location for Alaska after translation\n", + " alaska_desired_location = (-120, 27) # Adjust as needed\n", + "\n", + " # Apply scaling with the common origin (alaska_centroid)\n", + " alaska_gdf['geometry'] = alaska_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=0.41, yfact=0.5, origin=alaska_centroid)\n", + " )\n", + "\n", + " # After scaling, compute the new centroid\n", + " scaled_alaska_centroid = alaska_gdf.geometry.unary_union.centroid\n", + "\n", + " # Compute translation amounts based on the desired location\n", + " delta_x = alaska_desired_location[0] - scaled_alaska_centroid.x\n", + " delta_y = alaska_desired_location[1] - scaled_alaska_centroid.y\n", + "\n", + " # Apply translation to each geometry\n", + " alaska_gdf['geometry'] = alaska_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(\"Alaska data is not available.\")\n", + "\n", + " # Hawaii: Scale and translate\n", + " if not hawaii_gdf.empty:\n", + " hawaii_gdf = hawaii_gdf.copy()\n", + " hawaii_centroid = hawaii_gdf.geometry.unary_union.centroid\n", + "\n", + " # Apply scaling\n", + " hawaii_gdf['geometry'] = hawaii_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=1.5, yfact=1.5, origin=hawaii_centroid)\n", + " )\n", + "\n", + " # Desired location for Hawaii after translation\n", + " hawaii_desired_location = (-105, 22) # Adjust as needed\n", + " delta_x = hawaii_desired_location[0] - hawaii_centroid.x\n", + " delta_y = hawaii_desired_location[1] - hawaii_centroid.y\n", + "\n", + " # Apply translation\n", + " hawaii_gdf['geometry'] = hawaii_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(\"Hawaii data is not available.\")\n", + "\n", + " # Puerto Rico: Scale and translate\n", + " if not puerto_rico_gdf.empty:\n", + " puerto_rico_gdf = puerto_rico_gdf.copy()\n", + " 
pr_centroid = puerto_rico_gdf.geometry.unary_union.centroid\n", + "\n", + " # Apply scaling\n", + " puerto_rico_gdf['geometry'] = puerto_rico_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=1.5, yfact=1.5, origin=pr_centroid)\n", + " )\n", + "\n", + " # Desired location for Puerto Rico after translation\n", + " pr_desired_location = (-75, 22) # Adjust as needed\n", + " delta_x = pr_desired_location[0] - pr_centroid.x\n", + " delta_y = pr_desired_location[1] - pr_centroid.y\n", + "\n", + " # Apply translation\n", + " puerto_rico_gdf['geometry'] = puerto_rico_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(\"Puerto Rico data is not available.\")\n", + " # Create an empty GeoDataFrame to prevent errors during concatenation\n", + " puerto_rico_gdf = gpd.GeoDataFrame(columns=gdf.columns)\n", + "\n", + " # Combine all regions back into a single GeoDataFrame\n", + " combined_gdf = pd.concat(\n", + " [mainland_gdf, alaska_gdf, hawaii_gdf, puerto_rico_gdf], ignore_index=True\n", + " )\n", + "\n", + " # Remove any rows with missing kg_per_mwh\n", + " combined_gdf = combined_gdf.dropna(subset=['kg_per_mwh'])\n", + "\n", + " return combined_gdf\n", + "\n", + "# Process data for selected years and store GeoDataFrames\n", + "years_to_process = [2019, 2020, 2021, 2022]\n", + "processed_data = {}\n", + "for year in years_to_process:\n", + " if year in egrid_urls:\n", + " gdf = process_year(year, egrid_urls[year])\n", + " if gdf is not None:\n", + " processed_data[year] = gdf\n", + " else:\n", + " print(f\"No data available for year {year}.\")\n", + "\n", + "# Now plot subplots\n", + "num_years = len(processed_data)\n", + "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n", + "axes = axes.flatten()\n", + "\n", + "# Define common vmax and vmin for consistent color scaling\n", + "vmin = min(gdf['kg_per_mwh'].min() for gdf in processed_data.values())\n", + "vmax = max(gdf['kg_per_mwh'].max() for gdf in processed_data.values())\n", + "\n", + "for ax, (year, combined_gdf) in zip(axes, sorted(processed_data.items())):\n", + " combined_gdf.plot(\n", + " column='kg_per_mwh',\n", + " cmap='YlGnBu',\n", + " linewidth=0.8,\n", + " ax=ax,\n", + " edgecolor='0.8',\n", + " vmax=vmax,\n", + " vmin=vmin\n", + " )\n", + "\n", + " # Add labels for each subregion\n", + " combined_gdf['coords'] = combined_gdf['geometry'].apply(lambda x: x.representative_point().coords[0])\n", + " for idx, row in combined_gdf.iterrows():\n", + " x, y = row['coords']\n", + " # Manually adjust positions for overlapping labels\n", + " if row['Name'] == 'NYLI':\n", + " x_offset = 0.9\n", + " y_offset = -0.9\n", + " elif row['Name'] == 'NYCW':\n", + " x_offset = 0.5\n", + " y_offset = 1\n", + " elif row['Name'] == 'PRMS':\n", + " x_offset = 0\n", + " y_offset = 0.7\n", + " else:\n", + " x_offset = 0\n", + " y_offset = 0\n", + " ax.annotate(\n", + " text=row['Name'],\n", + " xy=(x + x_offset, y + y_offset),\n", + " horizontalalignment='center',\n", + " fontsize=8,\n", + " color='white',\n", + " path_effects=[pe.withStroke(linewidth=2, foreground=\"black\")]\n", + " )\n", + "\n", + " # Adjust plot limits to include the new regions\n", + " ax.set_xlim(-130, -65)\n", + " ax.set_ylim(20, 50)\n", + "\n", + " # Simplify the title to display only the year\n", + " ax.set_title(f\"{year}\", fontsize=15)\n", + "\n", + " ax.axis('off')\n", + "\n", + "\n", + "# Adjust layout and add a colorbar\n", + "plt.tight_layout()\n", + "fig.subplots_adjust(right=0.85, hspace=0.05) 
# Set hspace close to zero for minimal vertical spacing\n", + "\n", + "# Additional adjustment to overlap rows if needed\n", + "# Manually adjust each axis position to achieve a slight negative spacing effect\n", + "for i, ax in enumerate(axes):\n", + " pos = ax.get_position()\n", + " if i >= 2: # Only move the bottom row up\n", + " ax.set_position([pos.x0, pos.y0 + 0.12, pos.width, pos.height]) # Adjust as needed to control overlap\n", + "\n", + "cbar_ax = fig.add_axes([0.88, 0.24, 0.03, 0.7]) # Move up by increasing the bottom and adjust height as needed\n", + "\n", + "sm = plt.cm.ScalarMappable(cmap='YlGnBu', norm=plt.Normalize(vmin=vmin, vmax=vmax))\n", + "sm._A = [] # Empty array for the scalar mappable\n", + "cbar = fig.colorbar(sm, cax=cbar_ax)\n", + "cbar.set_label('kg CO₂e/MWh', fontsize=12)\n", + "\n", + "plt.savefig(\"egrid_carbon_intensity_subplots.png\", dpi=300, bbox_inches='tight')\n", + "plt.savefig(\"egrid_carbon_intensity_subplots.pdf\", dpi=150, bbox_inches='tight')\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from zipfile import ZipFile\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "from shapely.affinity import scale, translate\n", + "import matplotlib.patheffects as pe\n", + "\n", + "# Directory to store downloaded data\n", + "local_data_dir = \"./data/\"\n", + "os.makedirs(local_data_dir, exist_ok=True)\n", + "\n", + "# URLs for the 2014 shapefile and data file\n", + "data_file_url = \"https://www.epa.gov/system/files/other-files/2023-01/egrid_historical_files_1996-2016.zip\"\n", + "shapefile_url = \"https://www.epa.gov/sites/default/files/2017-01/egrid_subregions.zip\"\n", + "\n", + "# Paths for downloaded ZIP files\n", + "data_file_zip_path = os.path.join(local_data_dir, \"egrid_historical_files_1996-2016.zip\")\n", + "shapefile_zip_path = os.path.join(local_data_dir, \"egrid_subregions.zip\")\n", + "\n", + "# Directory paths for extracted files\n", + "data_file_dir = os.path.join(local_data_dir, \"egrid_historical_files_1996-2016/egrid_historical_files_1996-2016\") # Updated to inner directory\n", + "shapefile_dir = os.path.join(local_data_dir, \"egrid_subregions\")\n", + "\n", + "# Download function\n", + "def download_file(url, local_path):\n", + " if not os.path.exists(local_path):\n", + " print(f\"Downloading from {url}...\")\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " with open(local_path, 'wb') as f:\n", + " f.write(response.content)\n", + " print(f\"Downloaded to {local_path}.\")\n", + " else:\n", + " print(f\"Failed to download {url}. Status code: {response.status_code}\")\n", + " else:\n", + " print(f\"{local_path} already downloaded.\")\n", + "\n", + "# Extract function\n", + "def extract_zip(zip_path, extract_to):\n", + " if not os.path.exists(extract_to):\n", + " os.makedirs(extract_to, exist_ok=True)\n", + " with ZipFile(zip_path, 'r') as zip_ref:\n", + " zip_ref.extractall(extract_to)\n", + " print(f\"Extracted {zip_path} to {extract_to}.\")\n", + " else:\n", + " print(f\"{extract_to} already exists. 
Extraction skipped.\")\n", + "\n", + "# Download and extract data files\n", + "download_file(data_file_url, data_file_zip_path)\n", + "extract_zip(data_file_zip_path, os.path.join(local_data_dir, \"egrid_historical_files_1996-2016\"))\n", + "\n", + "# Download and extract shapefile\n", + "download_file(shapefile_url, shapefile_zip_path)\n", + "extract_zip(shapefile_zip_path, shapefile_dir)\n", + "\n", + "# Path to the 2014 data file\n", + "data_file_path = os.path.join(data_file_dir, \"eGRID2014_Data_v2.xlsx\")\n", + "\n", + "# Load carbon intensity data from the Excel file\n", + "if not os.path.exists(data_file_path):\n", + " print(f\"Data file {data_file_path} not found. Available files in the directory:\")\n", + " for root, dirs, files in os.walk(data_file_dir):\n", + " for file in files:\n", + " print(os.path.join(root, file))\n", + "else:\n", + " # Read the specific worksheet and columns\n", + " intensity_df = pd.read_excel(data_file_path, sheet_name=\"SRL14\", usecols=[\"SUBRGN\", \"SRC2ERTA\"], skiprows=1)\n", + " intensity_df[\"SRC2ERTA\"] = pd.to_numeric(intensity_df[\"SRC2ERTA\"], errors=\"coerce\") * 0.45359237 # Convert lbs to kg\n", + " intensity_df = intensity_df.dropna(subset=[\"SRC2ERTA\"])\n", + "\n", + " print(\"Sample of carbon intensity data:\")\n", + " print(intensity_df.head())\n", + "\n", + "\n", + " # Load the 2014 shapefile\n", + " shapefile_path = os.path.join(shapefile_dir, \"eGRID2014_subregions.shp\")\n", + " if not os.path.exists(shapefile_path):\n", + " print(f\"Shapefile {shapefile_path} not found.\")\n", + " else:\n", + " gdf = gpd.read_file(shapefile_path)\n", + "\n", + " # Print available columns to identify the correct subregion code column\n", + " print(\"Columns in shapefile:\", gdf.columns)\n", + "\n", + " # Ensure geometries are in WGS84 CRS (EPSG:4326)\n", + " if gdf.crs != 'EPSG:4326':\n", + " print(\"Reprojecting geometries to WGS84 (EPSG:4326).\")\n", + " gdf = gdf.to_crs(epsg=4326)\n", + "\n", + " # Assuming 'SUBRGN' is the correct column name based on printed columns\n", + " # Adjust the variable name if needed\n", + " subregion_column = 'SUBRGN' # Replace 'SUBRGN' if another name is used for subregions\n", + "\n", + " # Print a sample of the 'zips_for_G' column to inspect its contents\n", + " print(\"Sample values in 'zips_for_G':\")\n", + " print(gdf['zips_for_G'].head())\n", + "\n", + " # If 'zips_for_G' contains subregion codes, use it as the subregion column for merging\n", + " subregion_column = 'zips_for_G' # Update this if necessary after inspecting the values\n", + "\n", + " # Merge shapefile with carbon intensity data\n", + " gdf = gdf.merge(intensity_df, left_on=subregion_column, right_on=\"SUBRGN\", how=\"left\")\n", + "\n", + " # The rest of the processing and plotting logic follows...\n", + "\n", + " # Define subregions for Alaska, Hawaii, and Puerto Rico\n", + " alaska_subregions = ['AKGD', 'AKMS']\n", + " hawaii_subregions = ['HIMS', 'HIOA', 'HIMS/HIOA']\n", + " puerto_rico_subregions = ['PR', 'PRVI', 'PRMS']\n", + "\n", + " # Separate regions for specific adjustments\n", + " mainland_gdf = gdf[~gdf[subregion_column].isin(alaska_subregions + hawaii_subregions + puerto_rico_subregions)]\n", + " alaska_gdf = gdf[gdf[subregion_column].isin(alaska_subregions)]\n", + " hawaii_gdf = gdf[gdf[subregion_column].isin(hawaii_subregions)]\n", + " puerto_rico_gdf = gdf[gdf[subregion_column].isin(puerto_rico_subregions)]\n", + "\n", + " # Alaska: Scale and translate\n", + " if not alaska_gdf.empty:\n", + " alaska_centroid = 
alaska_gdf.geometry.unary_union.centroid\n", + " alaska_desired_location = (-120, 27)\n", + " alaska_gdf['geometry'] = alaska_gdf['geometry'].apply(\n", + " lambda geom: translate(scale(geom, xfact=0.41, yfact=0.5, origin=alaska_centroid),\n", + " xoff=alaska_desired_location[0] - alaska_centroid.x,\n", + " yoff=alaska_desired_location[1] - alaska_centroid.y)\n", + " )\n", + "\n", + " # Hawaii: Scale and translate\n", + " if not hawaii_gdf.empty:\n", + " hawaii_centroid = hawaii_gdf.geometry.unary_union.centroid\n", + " hawaii_desired_location = (-105, 22)\n", + " hawaii_gdf['geometry'] = hawaii_gdf['geometry'].apply(\n", + " lambda geom: translate(scale(geom, xfact=1.5, yfact=1.5, origin=hawaii_centroid),\n", + " xoff=hawaii_desired_location[0] - hawaii_centroid.x,\n", + " yoff=hawaii_desired_location[1] - hawaii_centroid.y)\n", + " )\n", + "\n", + " # Puerto Rico: Scale and translate\n", + " if not puerto_rico_gdf.empty:\n", + " pr_centroid = puerto_rico_gdf.geometry.unary_union.centroid\n", + " pr_desired_location = (-75, 22)\n", + " puerto_rico_gdf['geometry'] = puerto_rico_gdf['geometry'].apply(\n", + " lambda geom: translate(scale(geom, xfact=1.5, yfact=1.5, origin=pr_centroid),\n", + " xoff=pr_desired_location[0] - pr_centroid.x,\n", + " yoff=pr_desired_location[1] - pr_centroid.y)\n", + " )\n", + "\n", + " # Combine all regions back into a single GeoDataFrame\n", + " combined_gdf = pd.concat([mainland_gdf, alaska_gdf, hawaii_gdf, puerto_rico_gdf], ignore_index=True)\n", + "\n", + " # Plot\n", + " fig, ax = plt.subplots(figsize=(12, 8))\n", + " combined_gdf.plot(column='SRC2ERTA', cmap='YlGnBu', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)\n", + "\n", + " # Add labels for each subregion\n", + " combined_gdf['coords'] = combined_gdf['geometry'].apply(lambda x: x.representative_point().coords[0])\n", + " for idx, row in combined_gdf.iterrows():\n", + " x, y = row['coords']\n", + " ax.annotate(\n", + " text=row[subregion_column],\n", + " xy=(x, y),\n", + " horizontalalignment='center',\n", + " fontsize=8,\n", + " color='white',\n", + " path_effects=[pe.withStroke(linewidth=2, foreground=\"black\")]\n", + " )\n", + "\n", + " # Customize plot appearance\n", + " # Adjust plot limits to include the new regions\n", + " ax.set_xlim(-130, -65)\n", + " ax.set_ylim(20, 50)\n", + "\n", + " ax.set_title(\"eGRID Subregions - 2014 CO₂ Intensity (kg/MWh)\", fontsize=15)\n", + " ax.axis('off')\n", + " plt.tight_layout()\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# gdf = extract_to_geojson(kml_path, year)\n", + "# print(gdf.head()) # Debugging line\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# print(gdf['Description'].head(10)) # Print the first 10 descriptions to see what they look like\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# new 2014" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from zipfile import ZipFile\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "from shapely.affinity import scale, translate\n", + "import matplotlib.patheffects as pe\n", + "\n", + "# Directory to store downloaded data\n", + "local_data_dir = \"./data/\"\n", + "os.makedirs(local_data_dir, exist_ok=True)\n", + "\n", + "# eGRID Data URLs\n", + 
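"# Note: the 2014 release is packaged differently from later years. Its data xlsx lives\n", + "# inside a historical-files ZIP and its subregions come as a zipped shapefile, whereas\n", + "# 2019+ provide a standalone xlsx plus a KMZ (or zipped shapefile) of subregions.\n", +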
"egrid_urls = {\n", + " 2014: {\n", + " \"data_zip\": \"https://www.epa.gov/system/files/other-files/2023-01/egrid_historical_files_1996-2016.zip\",\n", + " \"shapefile_zip\": \"https://www.epa.gov/sites/default/files/2017-01/egrid_subregions.zip\",\n", + " },\n", + " 2019: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx',\n", + " \"subregions_kmz\": 'https://www.epa.gov/sites/default/files/2021-02/egrid2019_subregions.kmz',\n", + " },\n", + " 2021: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/system/files/documents/2023-01/eGRID2021_data.xlsx',\n", + " \"subregions_shp\": 'https://gaftp.epa.gov/EPADataCommons/ORD/egrid2021/eGRID2021%20Subregions%20Shapefile.zip',\n", + " },\n", + " 2022: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/system/files/documents/2023-01/egrid2022_data.xlsx',\n", + " \"subregions_kmz\": 'https://www.epa.gov/system/files/other-files/2023-01/egrid2022_subregions.kmz',\n", + " },\n", + "}\n", + "\n", + "# Conversion factor from pounds to kilograms\n", + "LBS_PER_KG = 0.45359237\n", + "\n", + "# Function to download files\n", + "def download_file(url, local_path):\n", + " if not os.path.exists(local_path):\n", + " print(f\"Downloading from {url}...\")\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " with open(local_path, 'wb') as f:\n", + " f.write(response.content)\n", + " print(f\"Downloaded to {local_path}.\")\n", + " else:\n", + " print(f\"Failed to download {url}. Status code: {response.status_code}\")\n", + " else:\n", + " print(f\"{local_path} already downloaded.\")\n", + "\n", + "# Function to extract ZIP files\n", + "def extract_zip(zip_path, extract_to):\n", + " if not os.path.exists(extract_to):\n", + " os.makedirs(extract_to, exist_ok=True)\n", + " with ZipFile(zip_path, 'r') as zip_ref:\n", + " zip_ref.extractall(extract_to)\n", + " print(f\"Extracted {zip_path} to {extract_to}.\")\n", + " else:\n", + " print(f\"{extract_to} already exists. 
Extraction skipped.\")\n", + "\n", + "# Function to process data for 2014\n", + "def process_2014_data(urls):\n", + " print(\"Processing 2014 data with specialized logic...\")\n", + "\n", + " # URLs for the 2014 shapefile and data file\n", + " data_file_url = urls[\"data_zip\"]\n", + " shapefile_url = urls[\"shapefile_zip\"]\n", + "\n", + " # Paths for downloaded ZIP files\n", + " data_file_zip_path = os.path.join(local_data_dir, \"egrid_historical_files_1996-2016.zip\")\n", + " shapefile_zip_path = os.path.join(local_data_dir, \"egrid_subregions.zip\")\n", + "\n", + " # Directory paths for extracted files\n", + " data_file_dir = os.path.join(local_data_dir, \"egrid_historical_files_1996-2016\")\n", + " shapefile_dir = os.path.join(local_data_dir, \"egrid_subregions\")\n", + "\n", + " # Download and extract data files\n", + " download_file(data_file_url, data_file_zip_path)\n", + " extract_zip(data_file_zip_path, data_file_dir)\n", + "\n", + " # Download and extract shapefile\n", + " download_file(shapefile_url, shapefile_zip_path)\n", + " extract_zip(shapefile_zip_path, shapefile_dir)\n", + "\n", + " # Adjust data_file_dir to account for nested directories\n", + " nested_data_dir = os.path.join(data_file_dir, \"egrid_historical_files_1996-2016\")\n", + " if os.path.exists(nested_data_dir):\n", + " data_file_dir = nested_data_dir\n", + "\n", + " # Path to the 2014 data file\n", + " data_file_path = os.path.join(data_file_dir, \"eGRID2014_Data_v2.xlsx\")\n", + "\n", + " # Verify that the data file exists\n", + " if not os.path.exists(data_file_path):\n", + " print(f\"Data file {data_file_path} not found. Available files in the directory:\")\n", + " for root, dirs, files in os.walk(data_file_dir):\n", + " for file in files:\n", + " print(os.path.join(root, file))\n", + " return None\n", + "\n", + " # Load carbon intensity data from the Excel file\n", + " try:\n", + " # Read the specific worksheet and columns\n", + " intensity_df = pd.read_excel(data_file_path, sheet_name=\"SRL14\", usecols=[\"SUBRGN\", \"SRC2ERTA\"], skiprows=1)\n", + " intensity_df[\"SRC2ERTA\"] = pd.to_numeric(intensity_df[\"SRC2ERTA\"], errors=\"coerce\")\n", + " intensity_df = intensity_df.dropna(subset=[\"SRC2ERTA\"])\n", + "\n", + " # Convert lbs to kg\n", + " intensity_df[\"kg_per_mwh\"] = intensity_df[\"SRC2ERTA\"] * LBS_PER_KG\n", + "\n", + " print(\"Sample of carbon intensity data:\")\n", + " print(intensity_df.head())\n", + "\n", + " except Exception as e:\n", + " print(f\"Failed to read eGRID data for 2014: {e}\")\n", + " return None\n", + "\n", + " # Load the 2014 shapefile\n", + " try:\n", + " shapefile_path = os.path.join(shapefile_dir, \"eGRID2014_subregions.shp\")\n", + " gdf = gpd.read_file(shapefile_path)\n", + "\n", + " # Determine the subregion column\n", + " if 'SUBRGN' in gdf.columns:\n", + " subregion_column = 'SUBRGN'\n", + " elif 'zips_for_G' in gdf.columns:\n", + " subregion_column = 'zips_for_G'\n", + " else:\n", + " print(\"Subregion column not found in shapefile.\")\n", + " return None\n", + "\n", + " # Ensure geometries are in WGS84 CRS (EPSG:4326)\n", + " if gdf.crs != 'EPSG:4326':\n", + " print(\"Reprojecting geometries to WGS84 (EPSG:4326).\")\n", + " gdf = gdf.to_crs(epsg=4326)\n", + "\n", + " # Merge shapefile with carbon intensity data\n", + " gdf = gdf.merge(intensity_df[['SUBRGN', 'kg_per_mwh']], left_on=subregion_column, right_on=\"SUBRGN\", how=\"left\")\n", + "\n", + " # Rename the subregion column to 'Name' to match other years\n", + " gdf = gdf.rename(columns={subregion_column: 
'Name'})\n", + "\n", + " except Exception as e:\n", + " print(f\"Failed to read subregions for 2014: {e}\")\n", + " return None\n", + "\n", + " # Adjust regions and return the processed GeoDataFrame\n", + " combined_gdf = adjust_regions(gdf, 2014)\n", + " return combined_gdf\n", + "\n", + "# Function to process data for other years\n", + "def process_year(year, urls):\n", + " print(f\"\\nProcessing data for year {year}...\")\n", + "\n", + " if year == 2014:\n", + " return process_2014_data(urls)\n", + " else:\n", + " # Paths for Excel and subregions files\n", + " egrid_xlsx_url = urls['egrid_xlsx']\n", + " egrid_xlsx_path = os.path.join(local_data_dir, f\"egrid{year}_data.xlsx\")\n", + "\n", + " # Determine the subregions file type and path\n", + " if 'subregions_kml' in urls:\n", + " subregions_url = urls['subregions_kml']\n", + " subregions_ext = 'kml'\n", + " elif 'subregions_kmz' in urls:\n", + " subregions_url = urls['subregions_kmz']\n", + " subregions_ext = 'kmz'\n", + " elif 'subregions_shp' in urls:\n", + " subregions_url = urls['subregions_shp']\n", + " subregions_ext = 'zip'\n", + " else:\n", + " print(f\"No subregions file provided for year {year}. Skipping...\")\n", + " return None\n", + "\n", + " subregions_path = os.path.join(local_data_dir, f\"egrid{year}_subregions.{subregions_ext}\")\n", + "\n", + " # Download files\n", + " download_file(egrid_xlsx_url, egrid_xlsx_path)\n", + " download_file(subregions_url, subregions_path)\n", + "\n", + " # Load carbon intensity data\n", + " try:\n", + " sheet_name = f'SRL{str(year)[-2:]}'\n", + " egrid_intensity_df = pd.read_excel(egrid_xlsx_path, sheet_name=sheet_name)\n", + "\n", + " # Use the relevant column for CO₂ intensity\n", + " co2_column_name = [col for col in egrid_intensity_df.columns if 'CO2 total output emission rate (lb/MWh)' in col]\n", + " if co2_column_name:\n", + " co2_column_name = co2_column_name[0]\n", + " else:\n", + " raise ValueError(\"CO₂ emission rate column not found.\")\n", + "\n", + " # Extract subregion acronym and carbon intensity\n", + " region_intensity = egrid_intensity_df[['eGRID subregion acronym', co2_column_name]].copy()\n", + " region_intensity.columns = ['SUBRGN', 'SRC2ERTA']\n", + "\n", + " # Convert to numeric and handle NaNs\n", + " region_intensity['SRC2ERTA'] = pd.to_numeric(region_intensity['SRC2ERTA'], errors='coerce')\n", + " region_intensity = region_intensity.dropna(subset=['SRC2ERTA'])\n", + "\n", + " # Convert lbs to kg\n", + " region_intensity['kg_per_mwh'] = region_intensity['SRC2ERTA'] * LBS_PER_KG\n", + " region_intensity = region_intensity[['SUBRGN', 'kg_per_mwh']]\n", + " region_intensity['SUBRGN'] = region_intensity['SUBRGN'].astype(str)\n", + "\n", + " print(f\"Extracted intensity data for {year}:\")\n", + " print(region_intensity.head())\n", + "\n", + " except Exception as e:\n", + " print(f\"Failed to read eGRID data for {year}: {e}\")\n", + " return None\n", + "\n", + " # Load subregions into GeoDataFrame\n", + " try:\n", + " if subregions_ext == 'kmz':\n", + " with ZipFile(subregions_path, 'r') as kmz:\n", + " kml_filename = [name for name in kmz.namelist() if name.endswith('.kml')][0]\n", + " kmz.extract(kml_filename, local_data_dir)\n", + " kml_path = os.path.join(local_data_dir, kml_filename)\n", + " gdf = gpd.read_file(kml_path, driver='KML')\n", + " os.remove(kml_path)\n", + " elif subregions_ext == 'kml':\n", + " gdf = gpd.read_file(subregions_path, driver='KML')\n", + " elif subregions_ext == 'zip':\n", + " # Extract the zip file\n", + " with 
ZipFile(subregions_path, 'r') as zip_ref:\n", + " zip_ref.extractall(local_data_dir)\n", + " # Find the shapefile (.shp)\n", + " shapefile_name = [name for name in zip_ref.namelist() if name.endswith('.shp')][0]\n", + " shapefile_path = os.path.join(local_data_dir, shapefile_name)\n", + " gdf = gpd.read_file(shapefile_path)\n", + " else:\n", + " print(f\"Unsupported subregions file format for year {year}.\")\n", + " return None\n", + "\n", + " # Check for 'SUBRGN' or 'Name' column and rename to 'Name'\n", + " if 'SUBRGN' in gdf.columns:\n", + " gdf = gdf.rename(columns={'SUBRGN': 'Name'})\n", + " elif 'Name' not in gdf.columns:\n", + " raise ValueError(\"Subregion name column not found in geospatial data.\")\n", + "\n", + " # Ensure geometries are in WGS84 CRS (EPSG:4326)\n", + " if gdf.crs != 'EPSG:4326':\n", + " print(f\"Reprojecting geometries to WGS84 for year {year}.\")\n", + " gdf = gdf.to_crs(epsg=4326)\n", + "\n", + " # Merge with carbon intensity data using 'Name' as the identifier\n", + " gdf = gdf.merge(region_intensity, left_on='Name', right_on='SUBRGN', how='left')\n", + "\n", + " except Exception as e:\n", + " print(f\"Failed to read subregions for {year}: {e}\")\n", + " return None\n", + "\n", + " # Adjust regions and return the processed GeoDataFrame\n", + " combined_gdf = adjust_regions(gdf, year)\n", + " return combined_gdf\n", + "\n", + "# Function to adjust regions (without plotting)\n", + "def adjust_regions(gdf, year):\n", + " # Define subregions for Alaska and Hawaii\n", + " alaska_subregions = ['AKGD', 'AKMS']\n", + " hawaii_subregions = ['HIMS', 'HIOA', 'HIMS/HIOA']\n", + " puerto_rico_subregions = ['PR', 'PRVI', 'PRMS']\n", + "\n", + " # Attempt to find Puerto Rico subregions in the data\n", + " pr_subregions = gdf[gdf['Name'].str.contains('PR', case=False, na=False)]['Name'].unique()\n", + " if len(pr_subregions) == 0:\n", + " # Try to identify PR subregions differently\n", + " pr_subregions = [name for name in gdf['Name'].unique() if 'PR' in str(name).upper()]\n", + " print(f\"Puerto Rico subregions in data for {year}: {pr_subregions}\")\n", + "\n", + " # Separate the regions\n", + " mainland_gdf = gdf[~gdf['Name'].isin(alaska_subregions + hawaii_subregions + list(pr_subregions))]\n", + " alaska_gdf = gdf[gdf['Name'].isin(alaska_subregions)]\n", + " hawaii_gdf = gdf[gdf['Name'].isin(hawaii_subregions)]\n", + " puerto_rico_gdf = gdf[gdf['Name'].isin(pr_subregions)]\n", + "\n", + " # Alaska: Scale and translate\n", + " if not alaska_gdf.empty:\n", + " alaska_gdf = alaska_gdf.copy()\n", + "\n", + " # Compute the centroid of Alaska (combined geometries)\n", + " alaska_centroid = alaska_gdf.geometry.unary_union.centroid\n", + "\n", + " # Desired location for Alaska after translation\n", + " alaska_desired_location = (-120, 27) # Adjust as needed\n", + "\n", + " # Apply scaling with the common origin (alaska_centroid)\n", + " alaska_gdf['geometry'] = alaska_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=0.41, yfact=0.5, origin=alaska_centroid)\n", + " )\n", + "\n", + " # After scaling, compute the new centroid\n", + " scaled_alaska_centroid = alaska_gdf.geometry.unary_union.centroid\n", + "\n", + " # Compute translation amounts based on the desired location\n", + " delta_x = alaska_desired_location[0] - scaled_alaska_centroid.x\n", + " delta_y = alaska_desired_location[1] - scaled_alaska_centroid.y\n", + "\n", + " # Apply translation to each geometry\n", + " alaska_gdf['geometry'] = alaska_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, 
xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(f\"Alaska data is not available for {year}.\")\n", + "\n", + " # Hawaii: Scale and translate\n", + " if not hawaii_gdf.empty:\n", + " hawaii_gdf = hawaii_gdf.copy()\n", + " hawaii_centroid = hawaii_gdf.geometry.unary_union.centroid\n", + "\n", + " # Apply scaling\n", + " hawaii_gdf['geometry'] = hawaii_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=1.5, yfact=1.5, origin=hawaii_centroid)\n", + " )\n", + "\n", + " # Desired location for Hawaii after translation\n", + " hawaii_desired_location = (-105, 22) # Adjust as needed\n", + " delta_x = hawaii_desired_location[0] - hawaii_centroid.x\n", + " delta_y = hawaii_desired_location[1] - hawaii_centroid.y\n", + "\n", + " # Apply translation\n", + " hawaii_gdf['geometry'] = hawaii_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(f\"Hawaii data is not available for {year}.\")\n", + "\n", + " # Puerto Rico: Scale and translate\n", + " if not puerto_rico_gdf.empty:\n", + " puerto_rico_gdf = puerto_rico_gdf.copy()\n", + " pr_centroid = puerto_rico_gdf.geometry.unary_union.centroid\n", + "\n", + " # Apply scaling\n", + " puerto_rico_gdf['geometry'] = puerto_rico_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=1.5, yfact=1.5, origin=pr_centroid)\n", + " )\n", + "\n", + " # Desired location for Puerto Rico after translation\n", + " pr_desired_location = (-75, 22) # Adjust as needed\n", + " delta_x = pr_desired_location[0] - pr_centroid.x\n", + " delta_y = pr_desired_location[1] - pr_centroid.y\n", + "\n", + " # Apply translation\n", + " puerto_rico_gdf['geometry'] = puerto_rico_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(f\"Puerto Rico data is not available for {year}.\")\n", + "\n", + " # Combine all regions back into a single GeoDataFrame\n", + " combined_gdf = pd.concat(\n", + " [mainland_gdf, alaska_gdf, hawaii_gdf, puerto_rico_gdf], ignore_index=True\n", + " )\n", + "\n", + " # Remove any rows with missing kg_per_mwh\n", + " combined_gdf = combined_gdf.dropna(subset=['kg_per_mwh'])\n", + "\n", + " return combined_gdf\n", + "\n", + "# Process data for selected years and store GeoDataFrames\n", + "years_to_process = [2014, 2019, 2021, 2022]\n", + "processed_data = {}\n", + "for year in years_to_process:\n", + " if year in egrid_urls:\n", + " gdf = process_year(year, egrid_urls[year])\n", + " if gdf is not None:\n", + " processed_data[year] = gdf\n", + " else:\n", + " print(f\"No data available for year {year}.\")\n", + "\n", + "# Now plot subplots\n", + "num_years = len(processed_data)\n", + "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n", + "axes = axes.flatten()\n", + "\n", + "# Define common vmax and vmin for consistent color scaling\n", + "vmin = min(gdf['kg_per_mwh'].min() for gdf in processed_data.values())\n", + "vmax = max(gdf['kg_per_mwh'].max() for gdf in processed_data.values())\n", + "\n", + "for ax, (year, combined_gdf) in zip(axes, sorted(processed_data.items())):\n", + " combined_gdf.plot(\n", + " column='kg_per_mwh',\n", + " cmap='YlGnBu',\n", + " linewidth=0.8,\n", + " ax=ax,\n", + " edgecolor='0.8',\n", + " vmax=vmax,\n", + " vmin=vmin\n", + " )\n", + "\n", + " # Add labels for each subregion\n", + " combined_gdf['coords'] = combined_gdf['geometry'].apply(lambda x: x.representative_point().coords[0])\n", + " for idx, row in combined_gdf.iterrows():\n", + " x, y = 
row['coords']\n", + " # Manually adjust positions for overlapping labels\n", + " if row['Name'] == 'NYLI':\n", + " x_offset = 0.9\n", + " y_offset = -0.9\n", + " elif row['Name'] == 'NYCW':\n", + " x_offset = 0.5\n", + " y_offset = 1\n", + " elif row['Name'] == 'PRMS':\n", + " x_offset = 0\n", + " y_offset = 0.7\n", + " else:\n", + " x_offset = 0\n", + " y_offset = 0\n", + " ax.annotate(\n", + " text=row['Name'],\n", + " xy=(x + x_offset, y + y_offset),\n", + " horizontalalignment='center',\n", + " fontsize=8,\n", + " color='white',\n", + " path_effects=[pe.withStroke(linewidth=2, foreground=\"black\")]\n", + " )\n", + "\n", + " # Adjust plot limits to include the new regions\n", + " ax.set_xlim(-130, -65)\n", + " ax.set_ylim(20, 50)\n", + "\n", + " # Simplify the title to display only the year\n", + " ax.set_title(f\"{year}\", fontsize=15)\n", + "\n", + " ax.axis('off')\n", + "\n", + "# Adjust layout and add a colorbar\n", + "plt.tight_layout()\n", + "fig.subplots_adjust(right=0.85, hspace=0.05) # Set hspace close to zero for minimal vertical spacing\n", + "\n", + "# Additional adjustment to overlap rows if needed\n", + "# Manually adjust each axis position to achieve a slight negative spacing effect\n", + "for i, ax in enumerate(axes):\n", + " pos = ax.get_position()\n", + " if i >= 2: # Only move the bottom row up\n", + " ax.set_position([pos.x0, pos.y0 + 0.12, pos.width, pos.height]) # Adjust as needed\n", + "\n", + "cbar_ax = fig.add_axes([0.88, 0.24, 0.03, 0.7]) # Adjust as needed\n", + "\n", + "sm = plt.cm.ScalarMappable(cmap='YlGnBu', norm=plt.Normalize(vmin=vmin, vmax=vmax))\n", + "sm._A = [] # Empty array for the scalar mappable\n", + "cbar = fig.colorbar(sm, cax=cbar_ax)\n", + "cbar.set_label('kg CO₂e/MWh', fontsize=12)\n", + "\n", + "plt.savefig(\"egrid_carbon_intensity_subplots.png\", dpi=300, bbox_inches='tight')\n", + "plt.savefig(\"egrid_carbon_intensity_subplots.pdf\", dpi=150, bbox_inches='tight')\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Delta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from zipfile import ZipFile\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "from shapely.affinity import scale, translate\n", + "import matplotlib.patheffects as pe\n", + "import warnings\n", + "\n", + "# Suppress warnings for cleaner output\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Directory to store downloaded data\n", + "local_data_dir = \"./data/\"\n", + "os.makedirs(local_data_dir, exist_ok=True)\n", + "\n", + "# eGRID Data URLs\n", + "egrid_urls = {\n", + " 2014: {\n", + " \"data_zip\": \"https://www.epa.gov/system/files/other-files/2023-01/egrid_historical_files_1996-2016.zip\",\n", + " \"shapefile_zip\": \"https://www.epa.gov/sites/default/files/2017-01/egrid_subregions.zip\",\n", + " },\n", + " 2020: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/system/files/documents/2022-09/eGRID2020_Data_v2.xlsx',\n", + " \"subregions_kmz\": 'https://www.epa.gov/system/files/other-files/2022-01/egrid2020_subregions.kmz',\n", + " },\n", + " 2022: {\n", + " \"egrid_xlsx\": 'https://www.epa.gov/system/files/documents/2023-01/egrid2022_data.xlsx',\n", + " \"subregions_kmz\": 'https://www.epa.gov/system/files/other-files/2023-01/egrid2022_subregions.kmz',\n", + " },\n", + "}\n", + "\n", + "# Conversion factor from pounds to kilograms\n", + "LBS_PER_KG = 
0.45359237\n", + "\n", + "# Function to download files\n", + "def download_file(url, local_path):\n", + " if not os.path.exists(local_path):\n", + " print(f\"Downloading from {url}...\")\n", + " response = requests.get(url, verify=False) # Added verify=False to handle SSL issues\n", + " if response.status_code == 200:\n", + " with open(local_path, 'wb') as f:\n", + " f.write(response.content)\n", + " print(f\"Downloaded to {local_path}.\")\n", + " else:\n", + " print(f\"Failed to download {url}. Status code: {response.status_code}\")\n", + " else:\n", + " print(f\"{local_path} already downloaded.\")\n", + "\n", + "# Function to extract ZIP files\n", + "def extract_zip(zip_path, extract_to):\n", + " if not os.path.exists(extract_to):\n", + " os.makedirs(extract_to, exist_ok=True)\n", + " with ZipFile(zip_path, 'r') as zip_ref:\n", + " zip_ref.extractall(extract_to)\n", + " print(f\"Extracted {zip_path} to {extract_to}.\")\n", + " else:\n", + " print(f\"{extract_to} already exists. Extraction skipped.\")\n", + "\n", + "# Function to process data for 2014\n", + "def process_2014_data(urls):\n", + " print(\"Processing 2014 data with specialized logic...\")\n", + "\n", + " # URLs for the 2014 shapefile and data file\n", + " data_file_url = urls[\"data_zip\"]\n", + " shapefile_url = urls[\"shapefile_zip\"]\n", + "\n", + " # Paths for downloaded ZIP files\n", + " data_file_zip_path = os.path.join(local_data_dir, \"egrid_historical_files_1996-2016.zip\")\n", + " shapefile_zip_path = os.path.join(local_data_dir, \"egrid_subregions_2014.zip\")\n", + "\n", + " # Directory paths for extracted files\n", + " data_file_dir = os.path.join(local_data_dir, \"egrid_historical_files_1996-2016\")\n", + " shapefile_dir = os.path.join(local_data_dir, \"egrid_subregions_2014\")\n", + "\n", + " # Download and extract data files\n", + " download_file(data_file_url, data_file_zip_path)\n", + " extract_zip(data_file_zip_path, data_file_dir)\n", + "\n", + " # Download and extract shapefile\n", + " download_file(shapefile_url, shapefile_zip_path)\n", + " extract_zip(shapefile_zip_path, shapefile_dir)\n", + "\n", + " # Adjust data_file_dir to account for nested directories\n", + " nested_data_dir = os.path.join(data_file_dir, \"egrid_historical_files_1996-2016\")\n", + " if os.path.exists(nested_data_dir):\n", + " data_file_dir = nested_data_dir\n", + "\n", + " # Path to the 2014 data file\n", + " data_file_path = os.path.join(data_file_dir, \"eGRID2014_Data_v2.xlsx\")\n", + "\n", + " # Verify that the data file exists\n", + " if not os.path.exists(data_file_path):\n", + " print(f\"Data file {data_file_path} not found.\")\n", + " return None\n", + "\n", + " # Load carbon intensity data from the Excel file\n", + " try:\n", + " # Read the specific worksheet and columns\n", + " intensity_df = pd.read_excel(data_file_path, sheet_name=\"SRL14\", usecols=[\"SUBRGN\", \"SRC2ERTA\"], skiprows=1)\n", + " intensity_df[\"SRC2ERTA\"] = pd.to_numeric(intensity_df[\"SRC2ERTA\"], errors=\"coerce\")\n", + " intensity_df = intensity_df.dropna(subset=[\"SRC2ERTA\"])\n", + "\n", + " # Convert lbs to kg\n", + " intensity_df[\"kg_per_mwh\"] = intensity_df[\"SRC2ERTA\"] * LBS_PER_KG\n", + "\n", + " # Keep only the required columns\n", + " intensity_df = intensity_df[['SUBRGN', 'kg_per_mwh']]\n", + "\n", + " # Remove Puerto Rico (not available in 2014)\n", + " intensity_df = intensity_df[~intensity_df['SUBRGN'].str.contains('PR', na=False)]\n", + "\n", + " print(\"Sample of carbon intensity data for 2014:\")\n", + " 
print(intensity_df.head())\n", + "\n", + " except Exception as e:\n", + " print(f\"Failed to read eGRID data for 2014: {e}\")\n", + " return None\n", + "\n", + " return intensity_df\n", + "\n", + "# Function to process data for a given year (2020 or 2022)\n", + "def process_year_data(year, urls, include_puerto_rico=True):\n", + " print(f\"Processing {year} data...\")\n", + "\n", + " # Paths for Excel and subregions files\n", + " egrid_xlsx_url = urls['egrid_xlsx']\n", + " egrid_xlsx_path = os.path.join(local_data_dir, f\"egrid{year}_data.xlsx\")\n", + "\n", + " subregions_url = urls['subregions_kmz']\n", + " subregions_path = os.path.join(local_data_dir, f\"egrid{year}_subregions.kmz\")\n", + "\n", + " # Download files\n", + " download_file(egrid_xlsx_url, egrid_xlsx_path)\n", + " download_file(subregions_url, subregions_path)\n", + "\n", + " # Load carbon intensity data\n", + " try:\n", + " sheet_name = f'SRL{str(year)[-2:]}'\n", + " egrid_intensity_df = pd.read_excel(egrid_xlsx_path, sheet_name=sheet_name)\n", + "\n", + " # Use the relevant column for CO₂ intensity\n", + " co2_column_name = [col for col in egrid_intensity_df.columns if 'CO2 total output emission rate (lb/MWh)' in col]\n", + " if co2_column_name:\n", + " co2_column_name = co2_column_name[0]\n", + " else:\n", + " raise ValueError(\"CO₂ emission rate column not found.\")\n", + "\n", + " # Extract subregion acronym and carbon intensity\n", + " region_intensity = egrid_intensity_df[['eGRID subregion acronym', co2_column_name]].copy()\n", + " region_intensity.columns = ['SUBRGN', 'SRC2ERTA']\n", + "\n", + " # Convert to numeric and handle NaNs\n", + " region_intensity['SRC2ERTA'] = pd.to_numeric(region_intensity['SRC2ERTA'], errors='coerce')\n", + " region_intensity = region_intensity.dropna(subset=['SRC2ERTA'])\n", + "\n", + " # Convert lbs to kg\n", + " region_intensity['kg_per_mwh'] = region_intensity['SRC2ERTA'] * LBS_PER_KG\n", + " region_intensity = region_intensity[['SUBRGN', 'kg_per_mwh']]\n", + " region_intensity['SUBRGN'] = region_intensity['SUBRGN'].astype(str)\n", + "\n", + " # Optionally remove Puerto Rico\n", + " if not include_puerto_rico:\n", + " region_intensity = region_intensity[~region_intensity['SUBRGN'].str.contains('PR', na=False)]\n", + "\n", + " print(f\"Sample of carbon intensity data for {year}:\")\n", + " print(region_intensity.head())\n", + "\n", + " except Exception as e:\n", + " print(f\"Failed to read eGRID data for {year}: {e}\")\n", + " return None, None\n", + "\n", + " # Load subregions into GeoDataFrame\n", + " try:\n", + " # Extract the KML file from the KMZ\n", + " with ZipFile(subregions_path, 'r') as kmz:\n", + " kml_filename = [name for name in kmz.namelist() if name.endswith('.kml')][0]\n", + " kmz.extract(kml_filename, local_data_dir)\n", + " kml_path = os.path.join(local_data_dir, kml_filename)\n", + " gdf = gpd.read_file(kml_path, driver='KML')\n", + " os.remove(kml_path)\n", + "\n", + " # Check for 'Name' column\n", + " if 'Name' not in gdf.columns:\n", + " raise ValueError(\"Subregion name column not found in geospatial data.\")\n", + "\n", + " # Optionally remove Puerto Rico from the geodataframe\n", + " if not include_puerto_rico:\n", + " gdf = gdf[~gdf['Name'].str.contains('PR', na=False)]\n", + "\n", + " # Ensure geometries are in WGS84 CRS (EPSG:4326)\n", + " if gdf.crs != 'EPSG:4326':\n", + " print(f\"Reprojecting geometries to WGS84 for {year}.\")\n", + " gdf = gdf.to_crs(epsg=4326)\n", + "\n", + " print(f\"Subregions in {year} geodataframe: {gdf['Name'].unique()}\")\n", + 
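"\n", + "        # Optional sanity check: intensity rows whose SUBRGN has no matching 'Name' here\n", + "        # will silently disappear in the later left merge onto this geodataframe.\n", + "        missing = set(region_intensity['SUBRGN']) - set(gdf['Name'])\n", + "        if missing:\n", + "            print(f\"Note: no geometry found for subregions: {missing}\")\n", +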
"\n", + " except Exception as e:\n", + " print(f\"Failed to read subregions for {year}: {e}\")\n", + " return None, None\n", + "\n", + " return region_intensity, gdf\n", + "\n", + "# Adjust regions (Alaska, Hawaii, and Puerto Rico)\n", + "def adjust_regions_delta(gdf):\n", + " # Define subregions for Alaska, Hawaii, and Puerto Rico\n", + " alaska_subregions = ['AKGD', 'AKMS']\n", + " hawaii_subregions = ['HIMS', 'HIOA', 'HIMS/HIOA']\n", + " puerto_rico_subregions = ['PR', 'PRMS', 'PRMSA']\n", + "\n", + " # Separate the regions\n", + " mainland_gdf = gdf[~gdf['Name'].isin(alaska_subregions + hawaii_subregions + puerto_rico_subregions)]\n", + " alaska_gdf = gdf[gdf['Name'].isin(alaska_subregions)]\n", + " hawaii_gdf = gdf[gdf['Name'].isin(hawaii_subregions)]\n", + " puerto_rico_gdf = gdf[gdf['Name'].isin(puerto_rico_subregions)]\n", + "\n", + " # Alaska: Scale and translate\n", + " if not alaska_gdf.empty:\n", + " alaska_gdf = alaska_gdf.copy()\n", + "\n", + " # Compute the centroid of Alaska (combined geometries)\n", + " alaska_centroid = alaska_gdf.geometry.unary_union.centroid\n", + "\n", + " # Desired location for Alaska after translation\n", + " alaska_desired_location = (-120, 27) # Adjust as needed\n", + "\n", + " # Apply scaling with the common origin (alaska_centroid)\n", + " alaska_gdf['geometry'] = alaska_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=0.41, yfact=0.5, origin=alaska_centroid)\n", + " )\n", + "\n", + " # After scaling, compute the new centroid\n", + " scaled_alaska_centroid = alaska_gdf.geometry.unary_union.centroid\n", + "\n", + " # Compute translation amounts based on the desired location\n", + " delta_x = alaska_desired_location[0] - scaled_alaska_centroid.x\n", + " delta_y = alaska_desired_location[1] - scaled_alaska_centroid.y\n", + "\n", + " # Apply translation to each geometry\n", + " alaska_gdf['geometry'] = alaska_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(\"Alaska data is not available.\")\n", + "\n", + " # Hawaii: Scale and translate\n", + " if not hawaii_gdf.empty:\n", + " hawaii_gdf = hawaii_gdf.copy()\n", + " hawaii_centroid = hawaii_gdf.geometry.unary_union.centroid\n", + "\n", + " # Apply scaling\n", + " hawaii_gdf['geometry'] = hawaii_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=1.5, yfact=1.5, origin=hawaii_centroid)\n", + " )\n", + "\n", + " # Desired location for Hawaii after translation\n", + " hawaii_desired_location = (-105, 22) # Adjust as needed\n", + " delta_x = hawaii_desired_location[0] - hawaii_centroid.x\n", + " delta_y = hawaii_desired_location[1] - hawaii_centroid.y\n", + "\n", + " # Apply translation\n", + " hawaii_gdf['geometry'] = hawaii_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(\"Hawaii data is not available.\")\n", + "\n", + " # Puerto Rico: Scale and translate\n", + " if not puerto_rico_gdf.empty:\n", + " puerto_rico_gdf = puerto_rico_gdf.copy()\n", + " pr_centroid = puerto_rico_gdf.geometry.unary_union.centroid\n", + "\n", + " # Apply scaling\n", + " puerto_rico_gdf['geometry'] = puerto_rico_gdf['geometry'].apply(\n", + " lambda geom: scale(geom, xfact=2.5, yfact=2.5, origin=pr_centroid)\n", + " )\n", + "\n", + " # Desired location for Puerto Rico after translation\n", + " pr_desired_location = (-75, 20) # Adjust as needed\n", + " delta_x = pr_desired_location[0] - pr_centroid.x\n", + " delta_y = pr_desired_location[1] - 
pr_centroid.y\n", + "\n", + " # Apply translation\n", + " puerto_rico_gdf['geometry'] = puerto_rico_gdf['geometry'].apply(\n", + " lambda geom: translate(geom, xoff=delta_x, yoff=delta_y)\n", + " )\n", + " else:\n", + " print(\"Puerto Rico data is not available.\")\n", + "\n", + " # Combine all regions back into a single GeoDataFrame\n", + " combined_gdf = pd.concat(\n", + " [mainland_gdf, alaska_gdf, hawaii_gdf, puerto_rico_gdf], ignore_index=True\n", + " )\n", + "\n", + " return combined_gdf\n", + "\n", + "# Process data for 2014\n", + "intensity_df_2014 = process_2014_data(egrid_urls[2014])\n", + "if intensity_df_2014 is None:\n", + " print(\"Failed to process 2014 data.\")\n", + " exit()\n", + "\n", + "# Process data for 2020, excluding Puerto Rico for 2014-2020 delta\n", + "region_intensity_2020_no_pr, gdf_2020_no_pr = process_year_data(2020, egrid_urls[2020], include_puerto_rico=False)\n", + "if region_intensity_2020_no_pr is None or gdf_2020_no_pr is None:\n", + " print(\"Failed to process 2020 data.\")\n", + " exit()\n", + "\n", + "# Merge 2014 and 2020 data on 'SUBRGN'\n", + "merged_intensity_2014_2020 = pd.merge(intensity_df_2014, region_intensity_2020_no_pr, on='SUBRGN', suffixes=('_2014', '_2020'))\n", + "\n", + "# Compute the delta\n", + "merged_intensity_2014_2020['delta_kg_per_mwh'] = merged_intensity_2014_2020['kg_per_mwh_2020'] - merged_intensity_2014_2020['kg_per_mwh_2014']\n", + "\n", + "print(\"Merged intensity data with delta from 2014 to 2020:\")\n", + "print(merged_intensity_2014_2020.head())\n", + "\n", + "# Merge with the 2020 geodataframe\n", + "gdf_delta_2014_2020 = gdf_2020_no_pr.merge(merged_intensity_2014_2020[['SUBRGN', 'delta_kg_per_mwh']], left_on='Name', right_on='SUBRGN', how='left')\n", + "\n", + "# Adjust regions\n", + "gdf_delta_2014_2020 = adjust_regions_delta(gdf_delta_2014_2020)\n", + "\n", + "gdf_delta_2014_2020['geometry'] = gdf_delta_2014_2020['geometry'].simplify(tolerance=0.1)\n", + "\n", + "\n", + "# Plot delta from 2014 to 2020\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "# Define diverging colormap\n", + "cmap = plt.cm.RdBu_r\n", + "\n", + "# Define vmax and vmin for color scaling\n", + "vmax = gdf_delta_2014_2020['delta_kg_per_mwh'].abs().max()\n", + "vmin = -vmax\n", + "\n", + "gdf_delta_2014_2020.plot(\n", + " column='delta_kg_per_mwh',\n", + " cmap=cmap,\n", + " linewidth=0.8,\n", + " ax=ax,\n", + " edgecolor='0.8',\n", + " vmax=vmax,\n", + " vmin=vmin\n", + ")\n", + "\n", + "# Define a function to calculate brightness based on RGB values\n", + "def is_dark_color(value, cmap, vmin, vmax):\n", + " \"\"\"Returns True if the color is dark, False otherwise.\"\"\"\n", + " norm_value = (value - vmin) / (vmax - vmin) # Normalize value to be between 0 and 1\n", + " rgb = cmap(norm_value)[:3] # Get RGB from colormap\n", + " brightness = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) # Perceived brightness formula\n", + " return brightness < 0.5 # Threshold to determine if color is \"dark\"\n", + "\n", + "# Add labels for each subregion\n", + "gdf_delta_2014_2020['coords'] = gdf_delta_2014_2020['geometry'].apply(lambda x: x.representative_point().coords[0])\n", + "for idx, row in gdf_delta_2014_2020.iterrows():\n", + " x, y = row['coords']\n", + " # Manually adjust positions for overlapping labels\n", + " if row['Name'] == 'NYLI':\n", + " x_offset = 0.9\n", + " y_offset = -0.9\n", + " elif row['Name'] == 'NYCW':\n", + " x_offset = 0.5\n", + " y_offset = 1\n", + " elif row['Name'] == 'PRMS':\n", + " x_offset = 0\n", + " 
+ "\n",
+ "# Now process data for 2020 and 2022, including Puerto Rico\n",
+ "region_intensity_2020_with_pr, gdf_2020_with_pr = process_year_data(2020, egrid_urls[2020], include_puerto_rico=True)\n",
+ "region_intensity_2022, gdf_2022 = process_year_data(2022, egrid_urls[2022], include_puerto_rico=True)\n",
+ "\n",
+ "if region_intensity_2020_with_pr is None or gdf_2020_with_pr is None or region_intensity_2022 is None or gdf_2022 is None:\n",
+ " print(\"Failed to process 2020 or 2022 data.\")\n",
+ " raise SystemExit\n",
+ "\n",
+ "# Merge 2020 and 2022 data on 'SUBRGN'\n",
+ "merged_intensity_2020_2022 = pd.merge(region_intensity_2020_with_pr, region_intensity_2022, on='SUBRGN', suffixes=('_2020', '_2022'))\n",
+ "\n",
+ "# Compute the delta\n",
+ "merged_intensity_2020_2022['delta_kg_per_mwh'] = merged_intensity_2020_2022['kg_per_mwh_2022'] - merged_intensity_2020_2022['kg_per_mwh_2020']\n",
+ "\n",
+ "print(\"Merged intensity data with delta from 2020 to 2022:\")\n",
+ "print(merged_intensity_2020_2022.head())\n",
+ "\n",
+ "# Merge with the 2022 geodataframe\n",
+ "gdf_delta_2020_2022 = gdf_2022.merge(merged_intensity_2020_2022[['SUBRGN', 'delta_kg_per_mwh']], left_on='Name', right_on='SUBRGN', how='left')\n",
+ "\n",
+ "# Adjust regions\n",
+ "gdf_delta_2020_2022 = adjust_regions_delta(gdf_delta_2020_2022)\n",
+ "\n",
+ "gdf_delta_2020_2022['geometry'] = gdf_delta_2020_2022['geometry'].simplify(tolerance=0.1)\n",
+ "\n",
+ "\n",
+ "# Plot delta from 2020 to 2022\n",
+ "fig, ax = plt.subplots(figsize=(12, 8))\n",
+ "\n",
+ "# Define diverging colormap\n",
+ "cmap = plt.cm.RdBu_r\n",
+ "\n",
+ "# Fixed symmetric color limits (kg CO₂e/MWh), chosen manually for this map\n",
+ "# vmax = gdf_delta_2020_2022['delta_kg_per_mwh'].abs().max()\n",
+ "vmax = 75\n",
+ "vmin = -75\n",
+ "# vmin = -vmax\n",
+ "\n",
+ "gdf_delta_2020_2022.plot(\n",
+ " column='delta_kg_per_mwh',\n",
+ " cmap=cmap,\n",
+ " linewidth=0.8,\n",
+ " ax=ax,\n",
+ " edgecolor='0.8',\n",
+ " vmax=vmax,\n",
+ " vmin=vmin\n",
+ ")\n",
+ "\n",
+ "# Add labels for each subregion\n",
+ "gdf_delta_2020_2022['coords'] = gdf_delta_2020_2022['geometry'].apply(lambda x: x.representative_point().coords[0])\n",
+ "for idx, row in gdf_delta_2020_2022.iterrows():\n",
+ " x, y = row['coords']\n",
+ " # Manually adjust positions for overlapping labels\n",
+ " if row['Name'] == 'NYLI':\n",
+ " x_offset = 0.9\n",
+ " y_offset = -0.9\n",
+ " elif row['Name'] == 'NYCW':\n",
+ " x_offset = 0.5\n",
+ " y_offset = 1\n",
+ " elif row['Name'] == 'PRMS':\n",
+ " x_offset = 0\n",
+ " y_offset = 0.4\n",
+ " else:\n",
+ " x_offset = 0\n",
+ " y_offset = 0\n",
+ "\n",
+ " # Determine color based on brightness\n",
+ " color = 'white' if is_dark_color(row['delta_kg_per_mwh'], cmap, vmin, vmax) else 'black'\n",
+ "\n",
+ " # Force black for NYCW so the offset label stays legible\n",
+ " if row['Name'] == 'NYCW':\n",
+ " color = 'black'\n",
+ " ax.annotate(\n",
+ " text=row['Name'],\n",
+ " xy=(x + x_offset, y + y_offset),\n",
+ " horizontalalignment='center',\n",
+ " fontsize=13,\n",
+ " color=color,\n",
+ " # path_effects=[pe.withStroke(linewidth=1.5, foreground=\"black\")]\n",
+ " )\n",
+ "\n",
+ "# Customize plot appearance\n",
+ "ax.set_xlim(-130, -65)\n",
+ "ax.set_ylim(17, 50)\n",
+ "\n",
+ "# ax.set_title(\"Change in eGRID Subregion CO₂ Intensity from 2020 to 2022 (kg/MWh)\", fontsize=15)\n",
+ "ax.axis('off')\n",
+ "\n",
+ "cbar_ax = fig.add_axes([0.92, 0.3, 0.02, 0.4])\n",
+ "\n",
+ "# Create the colorbar using the custom axis\n",
+ "sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))\n",
+ "sm.set_array([]) # public-API equivalent of the old sm._A = [] hack\n",
+ "cbar = fig.colorbar(sm, cax=cbar_ax, pad=0.1)\n",
+ "cbar.set_label('Change in kg CO₂e/MWh', fontsize=12)\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.savefig(\"egrid_delta_2020_2022.pdf\", dpi=150)\n",
+ "plt.savefig(\"egrid_delta_2020_2022.png\", dpi=300, transparent=True)\n",
+ "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "ENV3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 2 +} diff --git a/scripts/efficiency.ipynb b/scripts/efficiency.ipynb new file mode 100644 index 0000000..bbb3776 --- /dev/null +++ b/scripts/efficiency.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [
+ "import os\n",
+ "import requests\n",
+ "import zipfile\n",
+ "import geopandas as gpd\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import json\n",
+ "import matplotlib.colors as mcolors\n",
+ "from shapely.geometry import Polygon\n",
+ "import matplotlib.patches as mpatches\n",
+ "\n",
+ "# Directory to store shapefiles\n",
+ "shapefile_dir = \"shapefile\"\n",
+ "os.makedirs(shapefile_dir, exist_ok=True)\n",
+ "\n",
+ "# URLs for individual files in the GitHub repository and UACE Census TIGER data\n",
+ "natural_earth_files = {\n",
+ " \"ne_10m_admin_0_countries\": [\n",
+ " \"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural/ne_10m_admin_0_countries.shp\",\n",
+ " \"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural/ne_10m_admin_0_countries.shx\",\n",
+ " \"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural/ne_10m_admin_0_countries.dbf\",\n",
+ " \"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural/ne_10m_admin_0_countries.prj\",\n",
+ " ],\n",
+ " \"ne_10m_admin_1_states_provinces\": [\n",
+ " 
\"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural/ne_10m_admin_1_states_provinces.shp\",\n", + " \"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural/ne_10m_admin_1_states_provinces.shx\",\n", + " \"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural/ne_10m_admin_1_states_provinces.dbf\",\n", + " \"https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural/ne_10m_admin_1_states_provinces.prj\",\n", + " ]\n", + "}\n", + "\n", + "uace_url = \"https://www2.census.gov/geo/tiger/TIGER2024/UAC20/tl_2024_us_uac20.zip\"\n", + "uace_zip_path = os.path.join(shapefile_dir, \"tl_2024_us_uac20.zip\")\n", + "\n", + "# Function to download each file\n", + "def download_files(file_urls):\n", + " for url in file_urls:\n", + " filename = os.path.join(shapefile_dir, url.split('/')[-1])\n", + " if not os.path.exists(filename):\n", + " print(f\"Downloading {filename}...\")\n", + " response = requests.get(url)\n", + " response.raise_for_status()\n", + " with open(filename, 'wb') as f:\n", + " f.write(response.content)\n", + " print(f\"Downloaded {filename}.\")\n", + " else:\n", + " print(f\"{filename} already exists.\")\n", + "\n", + "# Function to download and extract the UACE ZIP file\n", + "def download_and_extract_uace(url, zip_path):\n", + " if not os.path.exists(zip_path):\n", + " print(f\"Downloading UACE data from {url}...\")\n", + " response = requests.get(url)\n", + " response.raise_for_status()\n", + " with open(zip_path, 'wb') as f:\n", + " f.write(response.content)\n", + " print(f\"Downloaded {zip_path}.\")\n", + " \n", + " # Extract UACE shapefile\n", + " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n", + " zip_ref.extractall(shapefile_dir)\n", + " print(f\"Extracted UACE shapefiles to {shapefile_dir}.\")\n", + "\n", + "# Download Natural Earth files\n", + "for file_urls in natural_earth_files.values():\n", + " download_files(file_urls)\n", + "\n", + "# Download and extract UACE shapefile\n", + "download_and_extract_uace(uace_url, uace_zip_path)\n", + "\n", + "# Load the shapefiles using the original filenames\n", + "world = gpd.read_file(os.path.join(shapefile_dir, \"ne_10m_admin_0_countries.shp\"))\n", + "states_provinces = gpd.read_file(os.path.join(shapefile_dir, \"ne_10m_admin_1_states_provinces.shp\"))\n", + "uace_shapefile = gpd.read_file(os.path.join(shapefile_dir, \"tl_2024_us_uac20.shp\"))\n", + "\n", + "# Filter for the United States in the world shapefile\n", + "us_boundary = world[world['NAME'] == 'United States of America']\n", + "\n", + "\n", + "def process_state_deltas(state_name, year_ranges, mode_type, metric_column):\n", + " # Determine the modes based on mode_type\n", + " if mode_type == \"bus\":\n", + " modes = ['MB', 'RB', 'CB']\n", + " title_suffix = \"Bus Modes\"\n", + " elif mode_type == \"train\":\n", + " modes = ['LR', 'HR', 'YR', 'CR']\n", + " title_suffix = \"Train Modes\"\n", + " else:\n", + " raise ValueError(\"Invalid mode_type. 
Use 'bus' or 'train'.\")\n",
+ "\n",
+ " # Define the column title for plot labels based on the metric;\n",
+ " # \"All Fuels (Wh/pkm)\" values are converted to Wh/km below, so label accordingly\n",
+ " if metric_column == \"All Fuels (Wh/pkm)\":\n",
+ " metric_title = \"Fuel Efficiency (Wh/km)\"\n",
+ " elif metric_column == \"Average Passengers\":\n",
+ " metric_title = \"Average Passengers\"\n",
+ " else:\n",
+ " metric_title = metric_column\n",
+ "\n",
+ " def load_year_data(year):\n",
+ " json_file = f\"../src/emcommon/resources/ntd{year}_intensities.json\"\n",
+ " with open(json_file, 'r') as f:\n",
+ " data = json.load(f)\n",
+ " df = pd.DataFrame(data['records'])\n",
+ "\n",
+ " # Copy so the coercions below operate on a real frame, not a view\n",
+ " # (avoids pandas' SettingWithCopyWarning)\n",
+ " mode_df = df[df['Mode'].isin(modes)].copy()\n",
+ "\n",
+ " mode_df[metric_column] = pd.to_numeric(mode_df[metric_column], errors='coerce')\n",
+ " mode_df['Unlinked Passenger Trips'] = pd.to_numeric(mode_df['Unlinked Passenger Trips'], errors='coerce')\n",
+ " mode_df['Average Passengers'] = pd.to_numeric(mode_df['Average Passengers'], errors='coerce')\n",
+ " mode_df = mode_df.dropna(subset=[metric_column, 'Unlinked Passenger Trips', 'UACE Code', 'Average Passengers'])\n",
+ "\n",
+ " # Convert Wh/pkm to Wh/km by multiplying by Average Passengers\n",
+ " if metric_column == \"All Fuels (Wh/pkm)\":\n",
+ " mode_df[metric_column] = mode_df[metric_column] * mode_df[\"Average Passengers\"]\n",
+ "\n",
+ " mode_df['UACE Code'] = mode_df['UACE Code'].astype(str).str.zfill(5)\n",
+ " # Trip-weighted mean of the metric across all records in each urban area\n",
+ " grouped = mode_df.groupby('UACE Code').apply(\n",
+ " lambda x: pd.Series({\n",
+ " f'Weighted {metric_column}': (x[metric_column] * x['Unlinked Passenger Trips']).sum() / x['Unlinked Passenger Trips'].sum(),\n",
+ " })\n",
+ " ).reset_index()\n",
+ "\n",
+ " return grouped\n",
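+ "\n",
+ " # Worked example of the conversion and weighting above (illustrative\n",
+ " # numbers, not NTD data): a record at 1500 Wh/pkm with 8 average\n",
+ " # passengers becomes 1500 * 8 = 12000 Wh/km; two such records at\n",
+ " # 12000 and 9000 Wh/km over 3e6 and 1e6 unlinked trips average to\n",
+ " # (12000*3e6 + 9000*1e6) / 4e6 = 11250 Wh/km for their urban area.\n",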
+ "\n",
+ " # Determine the number of plots based on the number of year ranges\n",
+ " num_plots = len(year_ranges)\n",
+ " figsize = (9 * num_plots, 8) # Adjust the figure size based on the number of plots\n",
+ "\n",
+ " # Create a figure with the appropriate number of subplots\n",
+ " fig, axs = plt.subplots(1, num_plots, figsize=figsize, constrained_layout=True)\n",
+ " if num_plots == 1:\n",
+ " axs = [axs] # Ensure axs is always a list for consistency\n",
+ "\n",
+ " # fig.subplots_adjust(top=0.88) # ignored under constrained_layout (matplotlib warns); suptitle spacing is set via y below\n",
+ " titley = 0.93\n",
+ " if state_name == \"North Carolina\":\n",
+ " titley = 0.76\n",
+ " elif state_name == \"Massachusetts\":\n",
+ " titley = 0.82\n",
+ " elif state_name == \"Colorado\":\n",
+ " titley = 0.86\n",
+ " fig.suptitle(f\"{state_name} {mode_type.title()} - {metric_title}\", fontsize=20, y=titley)\n",
+ "\n",
+ " # Define color normalization and color map based on metric and mode type\n",
+ " if metric_column == \"All Fuels (Wh/pkm)\":\n",
+ " norm = mcolors.Normalize(vmin=-800, vmax=800)\n",
+ " elif metric_column == \"Average Passengers\":\n",
+ " if mode_type == \"bus\":\n",
+ " norm = mcolors.Normalize(vmin=-4, vmax=4)\n",
+ " elif mode_type == \"train\":\n",
+ " norm = mcolors.Normalize(vmin=-40, vmax=40)\n",
+ " else:\n",
+ " norm = mcolors.Normalize(vmin=-4, vmax=4) # Default case if needed\n",
+ "\n",
+ " if metric_column == \"All Fuels (Wh/pkm)\":\n",
+ " cmap = 'RdYlGn_r'\n",
+ " else:\n",
+ " cmap = 'RdYlGn'\n",
+ "\n",
+ " # Filter for the specified state\n",
+ " state = states_provinces[states_provinces['name'] == state_name]\n",
+ " state_boundary = state.unary_union # Get the boundary as a single geometry\n",
+ "\n",
+ " for ax, (year_start, year_end) in zip(axs, year_ranges):\n",
+ " # Load data for both years\n",
+ " data_start = load_year_data(year_start)\n",
+ " data_end = load_year_data(year_end)\n",
+ "\n",
+ " # Merge data for both years to calculate the delta\n",
+ " data_delta = data_start.merge(data_end, on=\"UACE Code\", suffixes=(f'_{year_start}', f'_{year_end}'))\n",
+ " data_delta[f'Delta {metric_column}'] = data_delta[f'Weighted {metric_column}_{year_end}'] - data_delta[f'Weighted {metric_column}_{year_start}']\n",
+ "\n",
+ " # Merge with UACE shapefile for spatial plotting\n",
+ " uace_shapefile['GEOID20'] = uace_shapefile['GEOID20'].astype(str).str.zfill(5)\n",
+ " merged_gdf = uace_shapefile.merge(data_delta, left_on='GEOID20', right_on='UACE Code', how='inner')\n",
+ "\n",
+ " # Identify UACE regions with data and without data (no data)\n",
+ " uace_with_data = merged_gdf.copy()\n",
+ " uace_no_data = uace_shapefile[~uace_shapefile['GEOID20'].isin(uace_with_data['GEOID20'])]\n",
+ "\n",
+ " # Clip UACE regions to the state boundary. The TIGER layer is NAD83 and\n",
+ " # Natural Earth is WGS84; the two are treated as interchangeable here,\n",
+ " # which is fine at map scale.\n",
+ " clipped_with_data = gpd.overlay(uace_with_data, gpd.GeoDataFrame(geometry=[state_boundary], crs=uace_with_data.crs), how='intersection')\n",
+ " clipped_no_data = gpd.overlay(uace_no_data, gpd.GeoDataFrame(geometry=[state_boundary], crs=uace_no_data.crs), how='intersection')\n",
+ "\n",
+ " # Plot the delta values for the specified time range\n",
+ " clipped_with_data.plot(\n",
+ " ax=ax,\n",
+ " column=f'Delta {metric_column}',\n",
+ " cmap=cmap,\n",
+ " legend=False,\n",
+ " edgecolor='grey',\n",
+ " linewidth=0.5,\n",
+ " norm=norm,\n",
+ " )\n",
+ "\n",
+ " # Plot the UACE regions with no data using hatch pattern\n",
+ " clipped_no_data.plot(\n",
+ " ax=ax,\n",
+ " color=\"none\",\n",
+ " edgecolor=\"grey\",\n",
+ " hatch=\"////////\",\n",
+ " linewidth=0.5,\n",
+ " )\n",
+ "\n",
+ " # Overlay state boundary\n",
+ " state.plot(ax=ax, color=\"none\", edgecolor=\"black\", linewidth=1.5)\n",
+ "\n",
+ " # Set title and bounds for each subplot\n",
+ " ax.set_title(f\"{year_start}-{year_end} Delta\", fontsize=16)\n",
+ " if state_name == \"Massachusetts\":\n",
+ " ax.set_xlim([-73.5, -69.9])\n",
+ " ax.set_ylim([41.2, 42.9])\n",
+ " elif state_name == \"Colorado\":\n",
+ " ax.set_xlim([-109.1, -102.0])\n",
+ " ax.set_ylim([36.9, 41.0])\n",
+ " elif state_name == \"North Carolina\":\n",
+ " ax.set_xlim([-84.3, -75.5])\n",
+ " ax.set_ylim([33.8, 36.6])\n",
+ "\n",
+ " ax.set_aspect('equal')\n",
+ "\n",
+ " # Add a single colorbar for the entire figure\n",
+ " sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)\n",
+ " sm.set_array([]) # Dummy array for colorbar, via the public API\n",
+ " cbar = fig.colorbar(sm, ax=axs, orientation='vertical', fraction=0.02, pad=0.04, shrink=0.5)\n",
+ " cbar.set_label(f\"Delta {metric_title}\")\n",
+ "\n",
+ " # Create a custom legend entry for \"No Data\" with hatching\n",
+ " no_data_patch = mpatches.Patch(facecolor=\"none\", edgecolor=\"grey\", hatch=\"////////\", label=\"No Data\")\n",
+ " if state_name == \"North Carolina\":\n",
+ " patch_y = 0.25\n",
+ " else:\n",
+ " patch_y = 0.14\n",
+ " fig.legend(handles=[no_data_patch], loc=\"lower right\", bbox_to_anchor=(0.93, patch_y), fontsize=12)\n",
+ "\n",
+ " # Save and display the combined figure for the state and mode type\n",
+ " year_range_str = \"-\".join([f\"{start}_{end}\" for start, end in year_ranges])\n",
+ " plt.savefig(f\"delta_{metric_column.replace(' ', '_').replace('/', '_').replace('(', '').replace(')', '').lower()}_{title_suffix.replace(' ', '_').lower()}_{state_name.replace(' ', '_').lower()}_{year_range_str}.pdf\")\n",
+ "\n",
+ " plt.show()\n",
+ "\n",
+ "\n",
+ "# Specify states and year ranges for deltas\n",
+ "states = [\"Massachusetts\", \"Colorado\", \"North Carolina\"]\n",
+ "# year_ranges = [(2018, 2020), (2020, 2022)]\n",
+ "year_ranges = [(2018, 2022)]\n",
+ "\n",
+ "# Generate delta plots for each state and mode type for both metrics\n",
+ "# for mode_type in [\"bus\", \"train\"]:\n",
+ "for mode_type in [\"bus\"]:\n",
+ " for state in states:\n",
+ " process_state_deltas(state, year_ranges, mode_type, \"All Fuels (Wh/pkm)\") # For fuel efficiency\n",
+ " process_state_deltas(state, year_ranges, mode_type, \"Average Passengers\") # For average occupancy\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
+ "\n",
+ "def calculate_national_and_uace_average_passengers(years, uace=None, mode_type=\"bus\"):\n",
+ " \"\"\"\n",
+ " Calculate the trip-weighted national average bus passengers and,\n",
+ " optionally, the same figure for a specific UACE.\n",
+ "\n",
+ " Parameters:\n",
+ " years (list): List of years to process.\n",
+ " uace (str): UACE code to filter on. If None, only calculates national average.\n",
+ " mode_type (str): Mode type to process (default: 'bus').\n",
+ " \"\"\"\n",
+ " # Define bus modes\n",
+ " if mode_type == \"bus\":\n",
+ " modes = ['MB', 'RB', 'CB'] # Bus modes\n",
+ " else:\n",
+ " raise ValueError(\"Currently only 'bus' mode is supported.\")\n",
+ "\n",
+ " # Directory containing the JSON files\n",
+ " json_dir = \"../src/emcommon/resources/\"\n",
+ "\n",
+ " # Loop through each year and calculate averages\n",
+ " for year in years:\n",
+ " json_file = os.path.join(json_dir, f\"ntd{year}_intensities.json\")\n",
+ " if not os.path.exists(json_file):\n",
+ " print(f\"File for year {year} not found: {json_file}\")\n",
+ " continue\n",
+ "\n",
+ " with open(json_file, 'r') as f:\n",
+ " data = json.load(f)\n",
+ "\n",
+ " # Convert to DataFrame\n",
+ " df = pd.DataFrame(data['records'])\n",
+ "\n",
+ " # Filter for bus modes (copy to avoid SettingWithCopyWarning below)\n",
+ " mode_df = df[df['Mode'].isin(modes)].copy()\n",
+ "\n",
+ " # Ensure numeric columns\n",
+ " mode_df['Average Passengers'] = pd.to_numeric(mode_df['Average Passengers'], errors='coerce')\n",
+ " mode_df['Unlinked Passenger Trips'] = pd.to_numeric(mode_df['Unlinked Passenger Trips'], errors='coerce')\n",
+ "\n",
+ " # Drop rows with missing values\n",
+ " mode_df = mode_df.dropna(subset=['Average Passengers', 'Unlinked Passenger Trips'])\n",
+ "\n",
+ " # Calculate the weighted national average for passengers\n",
+ " total_passenger_trips = mode_df['Unlinked Passenger Trips'].sum()\n",
+ " weighted_avg_passengers = (mode_df['Average Passengers'] * mode_df['Unlinked Passenger Trips']).sum() / total_passenger_trips\n",
+ " print(f\"Year {year}: Weighted National Average Passengers for Bus = {weighted_avg_passengers:.2f}\")\n",
+ "\n",
+ " # If UACE is provided, filter and calculate for specific UACE\n",
+ " if uace:\n",
+ " # Normalize codes to 5-digit strings, matching the plotting cell above\n",
+ " uace_df = mode_df[mode_df['UACE Code'].astype(str).str.zfill(5) == str(uace).zfill(5)]\n",
+ "\n",
+ " if not uace_df.empty:\n",
+ " total_uace_trips = uace_df['Unlinked Passenger Trips'].sum()\n",
+ " weighted_uace_avg_passengers = (uace_df['Average Passengers'] * uace_df['Unlinked Passenger Trips']).sum() / total_uace_trips\n",
+ " print(f\"Year {year}, UACE {uace}: Weighted Average Passengers for Bus = {weighted_uace_avg_passengers:.2f}\")\n",
+ " else:\n",
+ " print(f\"Year {year}: No data for UACE {uace}\")\n",
+ "\n",
+ "# Define the years to process\n",
+ "years = [2018, 2019, 2020, 2021, 2022]\n",
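+ "\n",
+ "# Quick sanity check of the trip-weighting used above (illustrative numbers,\n",
+ "# not NTD data): records with 10 and 20 average passengers over 1e6 and 3e6\n",
+ "# unlinked trips should average (10*1e6 + 20*3e6) / 4e6 = 17.5.\n",
+ "check = pd.DataFrame({'Average Passengers': [10.0, 20.0], 'Unlinked Passenger Trips': [1e6, 3e6]})\n",
+ "assert (check['Average Passengers'] * check['Unlinked Passenger Trips']).sum() / check['Unlinked Passenger Trips'].sum() == 17.5\n",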
+ "\n", + "# Call the function with UACE filtering\n", + "calculate_national_and_uace_average_passengers(years, uace=\"23527\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ENV3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}