68 - [feat] Check for Project Updates (#77)

* [feat] scrape projects from ORES website and insert/update Supabase * [chore] commented out tests * [chore] small changes - solar pv, nyserda contract kdm completed/date, tendering of interconnection agreement kdm rename * [feat] updates for nyserda webscraper, also moved reverse geocoding of small solar projects to database.py * [feat] moved geocoding and kdm updating for ores to database, wrote function to delete nyiso withdrawn projects, include last_updated field for nyiso * [chore] temporarily commented out function to check/delete withdrawn nyiso projects * [fix] fix for datestring representation for Winning a contract award from NYSERDA kdm * [refactor] update NYSERDA large scale to track last_updated as a dict * [feat] updated last_updated dict type for all datasources, added function descriptions for database and scraper_utils * [chore] commenting out functions to delete cancelled and withdrawn projects for now
calblueprint · Nov 21, 2024 · cfbcef6 · cfbcef6
1 parent da9b5dd
commit cfbcef6
Show file tree

Hide file tree

Showing 9 changed files with 673 additions and 409 deletions.
diff --git a/api/webscraper/__pycache__/database_constants.cpython-312.pyc b/api/webscraper/__pycache__/database_constants.cpython-312.pyc
diff --git a/api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc b/api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc
diff --git a/api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc b/api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc
diff --git a/api/webscraper/database.py b/api/webscraper/database.py
diff --git a/api/webscraper/database_constants.py b/api/webscraper/database_constants.py
@@ -1,23 +1,25 @@
+# maps renewable energy strings to our app's renewable energy names
 renewable_energy_map = {
     "Hydroelectric": "Hydroelectric",
     "Land Based Wind": "Land-Based Wind",
     "Offshore Wind": "Offshore Wind",
-    "Solar": "Solar",
+    "Solar": "Solar PV",
     "Geothermal": "Geothermal",
     "Energy Storage": "Energy Storage",
     "Pumped Storage": "Pumped Storage",
 }
 
+# maps NYISO abbreviations to renewable energy strings
 renewable_energy_abbreviations = {
     "H": "Hydroelectric",
-    "S": "Solar",
+    "S": "Solar PV",
     "ES": "Energy Storage",
     "PS": "Pumped Storage",
     "W": "Land-Based Wind",
     "OSW": "Offshore Wind",
 }
 
-initial_kdm_dict = [
+initial_kdm = [
     {"milestoneTitle": "Entry to NYISO Queue", "completed": False, "date": None},
     {
         "milestoneTitle": "Application for permit to ORES",
@@ -35,19 +37,33 @@
         "date": None,
     },
     {
-        "milestoneTitle": "Execution of an Interconnection Agreement (IA)",
+        "milestoneTitle": "Tendering of an Interconnection Agreement (IA)",
         "completed": False,
         "date": None,
     },
     {"milestoneTitle": "Start of operations", "completed": False, "date": None},
-    {
-        "milestoneTitle": "Application for permit to ORES",
-        "completed": False,
-        "date": None,
-    },
-    {
-        "milestoneTitle": "Issuance of permit from ORES",
-        "completed": False,
-        "date": None,
-    },
+]
+
+project_fields = [
+    "project_name",
+    "renewable_energy_technology",
+    "project_status",
+    "developer",
+    "town",
+    "county",
+    "region",
+    "size",
+    "latitude",
+    "longitude",
+    "key_development_milestones",
+    "project_image",
+    "interconnection_queue_number",
+    "approved",
+    "state_senate_district",
+    "assembly_district",
+    "zipcode",
+    "proposed_cod",
+    "permit_process",
+    "permit_application_number",
+    "last_updated",
 ]
diff --git a/api/webscraper/nyiso_scraper.py b/api/webscraper/nyiso_scraper.py
@@ -30,6 +30,11 @@ def query_nyiso_excel():
 
 
 def query_nyiso():
+    """
+    Queries all the projects in the NYISO sheet and filters them.
+    Outdated: does not change behavior based on which sheet the project is from (Interconnection Queue, Cluster Projects, In Service).
+    Returns: list of dictionaries representing the projects
+    """
     if nyiso_xlsx_href is None:
         print('ERROR: "View the Interconnection Queue" link not found')
         return
@@ -50,7 +55,7 @@ def query_nyiso():
                 continue
             project_dict = {
                 "project_name": item.get("Project Name", None),
-                "project_status": "Proposed",  # TODO: update this based on which sheet it's from
+                "project_status": "Proposed",
                 "renewable_energy_technology": renewable_energy_abbreviations[
                     item.get("Type/ Fuel")
                 ],  # map abbreviations into readable string
@@ -95,7 +100,9 @@ def filter_nyiso_list(project_list, sheet_name):
     for item in project_list:
         if sheet_name == "Interconnection Queue" and item.get("State") != "NY":
             continue
-        elif sheet_name == "Cluster Projects" and item.get("State", None) != "New York":
+        elif sheet_name == "Cluster Projects" and not (
+            item.get("State", None) == "New York" or item.get("State", None) == "NY"
+        ):
             continue
         elif sheet_name == "In Service" and item.get("State", None) != "NY":
             continue
@@ -111,13 +118,22 @@ def filter_nyiso_list(project_list, sheet_name):
             "developer": item.get("Developer Name", None),
             "proposed_cod": item.get(
                 "Proposed COD", None
-            ),  # note: non-serializable into JSON --> can't directly write to file
+            ),  # NOTE: non-serializable into JSON --> can't directly write to file
             "county": item.get("County", None),
             "region": None,  # missing
             "zipcode": None,  # missing
             "latitude": None,
             "longitude": None,
-            # 'data_through_date': item.get('Last Updated Date', None),
+            "nyiso_last_updated": (
+                item.get("Last Updated Date", None)  # NOTE: non-serializable into JSON
+                if (
+                    sheet_name == "Interconnection Queue"
+                    or sheet_name == "Cluster Projects"
+                )
+                else item.get(
+                    "Last Update NaT", None
+                )  # NOTE: the column header for the in-service sheet is called "Last Update NaT"
+            ),
             "key_development_milestones": None,
             "project_image": None,
             "interconnection_queue_number": item.get("Queue Pos.", None),
@@ -155,7 +171,7 @@ def filter_nyiso_cluster_sheet():
     cluster_projects_df = clean_df_data(cluster_projects_df)
     cluster_projects_list = cluster_projects_df.to_dict(orient="records")
 
-    filtered_list = filter_nyiso_list(cluster_projects_list, "Cluster Project")
+    filtered_list = filter_nyiso_list(cluster_projects_list, "Cluster Projects")
     return filtered_list
 
 
@@ -184,9 +200,37 @@ def filter_nyiso_in_service_sheet():
     return filtered_list
 
 
+def filter_nyiso_withdrawn_sheets():
+    """
+    Returns a list of dicts, each with the single key "project_name", one per withdrawn project (rows with no project name are skipped)
+    """
+    all_sheets = query_nyiso_excel()
+    sheet_names = list(all_sheets.keys())
+    withdrawn_key = sheet_names[2]  # gets the sheet named "Withdrawn"
+    cluster_withdrawn_key = sheet_names[3]  # gets the sheet named "Cluster Withdrawn"
+
+    withdrawn_df = all_sheets[withdrawn_key]
+    withdrawn_df = clean_df_data(withdrawn_df)
+    withdrawn_list = withdrawn_df.to_dict(orient="records")
+
+    cluster_withdrawn_df = all_sheets[cluster_withdrawn_key]
+    cluster_withdrawn_df = clean_df_data(cluster_withdrawn_df)
+    cluster_withdrawn_list = cluster_withdrawn_df.to_dict(orient="records")
+
+    withdrawn_list = withdrawn_list + cluster_withdrawn_list
+    filtered_list = [
+        {"project_name": item.get("Project Name", None)}
+        for item in withdrawn_list
+        if item.get("Project Name", None) is not None
+    ]
+    return filtered_list
+
+
 """
 For testing
 """
 # write_nyiso_to_json()
+# print(filter_nyiso_iq_sheet())
 # print(filter_nyiso_in_service_sheet())
 # print(filter_nyiso_cluster_sheet())
+# print(filter_nyiso_withdrawn_sheets())
diff --git a/api/webscraper/nyserda_scraper.py b/api/webscraper/nyserda_scraper.py
@@ -1,7 +1,7 @@
 import requests
 import json
 from utils.scraper_utils import check_status, geocode_lat_long, standardize_label
-from database_constants import renewable_energy_map, initial_kdm_dict
+from database_constants import renewable_energy_map, initial_kdm
 
 """
 This scrapes data from the NYSERDA Large-scale Renewable Projects database.
@@ -10,6 +10,17 @@
 """
 
 
+def solicitation_name_to_date(solicitation_name):
+    if solicitation_name is None:
+        return None
+    if "-" not in solicitation_name:
+        return None
+    else:
+        parts = solicitation_name.split("-")
+        year = parts[0][-2::]
+        return f"20{year}-01-01"
+
+
 def query_nyserda_large():
     nyserda_large_response = requests.get("https://data.ny.gov/resource/dprp-55ye.json")
     if nyserda_large_response.status_code != 200:
@@ -26,11 +37,7 @@ def query_nyserda_large():
                 if item.get("renewable_technology", None) is not None
                 else None
             )
-            if (
-                check_status(item.get("project_status", None)) != "Cancelled"
-                and item.get("renewable_technology", None)
-                in renewable_energy_map.keys()
-            ):
+            if item.get("renewable_technology", None) in renewable_energy_map.keys():
                 project_dict = {
                     "project_name": item.get("project_name", None),
                     "project_status": check_status(item.get("project_status", None)),
@@ -51,17 +58,19 @@ def query_nyserda_large():
                         if item.get("georeference", None) is not None
                         else None
                     ),
-                    # 'data_through_date': item.get('data_through_date', None),
+                    "data_through_date": item.get("data_through_date").split("T")[0],
                     "permit_process": item.get("permit_process", None),
                     "interconnection_queue_number": item.get(
                         "interconnection_queue_number", None
                     ),
-                    "key_development_milestones": initial_kdm_dict,
+                    "size": item.get("new_renewable_capacity_mw", None),
+                    "key_development_milestones": initial_kdm,
                     "project_image": None,
                     "approved": False,
+                    "proposed_cod": item.get("year_of_delivery_start_date", None),
                     # used for updating the kdms
-                    "year_of_delivery_start_date": item.get(
-                        "year_of_delivery_start_date", None
+                    "nyserda_contract_date": solicitation_name_to_date(
+                        item.get("solicitation_name", None)
                     ),
                 }
                 filtered_list.append(project_dict)
@@ -76,17 +85,15 @@ def write_large_to_json():
         file.write("\n")
 
 
-"""
-This scrapes data from the NYSERDA Statewide Distributed Solar Projects database.
-We filter for specific columns from the database's API and save them to a json file.
-https://data.ny.gov/Energy-Environment/Statewide-Distributed-Solar-Projects-Beginning-200/wgsj-jt5f/about_data
-
-geocode_lat_long is a helper util function that uses the google maps geocoding api to get the estimated
-latitude and longitude of a project based on the town
-"""
-
-
 def query_nyserda_solar(offset=0, limit=1000):
+    """
+    Scrapes data from the NYSERDA Statewide Distributed Solar Projects database.
+    Keeps only projects of at least 2 MW that have a project_id and are not cancelled, and returns them as a list of project dictionaries.
+    https://data.ny.gov/Energy-Environment/Statewide-Distributed-Solar-Projects-Beginning-200/wgsj-jt5f/about_data
+
+    Latitude and longitude are left as None here: geocoding of these projects from the town
+    is no longer done in this function (it was moved to database.py).
+    """
     nyserda_small_response = requests.get(
         f"https://data.ny.gov/resource/wgsj-jt5f.json?$limit={limit}&$offset={offset}"
     )
@@ -108,45 +115,44 @@ def query_nyserda_solar(offset=0, limit=1000):
 
             if size_in_mw is None or size_in_mw < 2:
                 continue
+            if (
+                item.get("project_id", None) is None
+            ):  # some projects have no project_id, so we skip them
+                continue
 
             if check_status(item.get("project_status", None)) != "Cancelled":
-                if item.get("city_town", None) is not None:
-                    lat, long = geocode_lat_long(f"{item.get('city_town')}, NY")
-                else:
-                    lat, long = None, None
                 project_dict = {
                     "project_name": item.get(
                         "project_id", None
                     ),  # small data set only has project_id
                     "project_status": check_status(
                         item.get("project_status", None)
-                    ),  # missing
+                    ),  # NYSERDA small-scale solar projects do not have a project status
                     "renewable_energy_technology": "Solar",
                     "size": size_in_mw,
                     "developer": item.get("developer", None),
                     "proposed_cod": item.get("interconnection_date", None),
+                    "town": item.get("city_town", None),
                     "county": item.get("county", None),
                     "region": item.get("redc", None),  # missing
                     "zipcode": item.get("zip", None),
-                    "latitude": lat,
-                    "longitude": long,
-                    # 'data_through_date': item.get('data_through_date', None),
-                    "key_development_milestones": initial_kdm_dict,
+                    "latitude": None,
+                    "longitude": None,
+                    "data_through_date": item.get("data_through_date").split("T")[0],
+                    "key_development_milestones": initial_kdm,
                     "project_image": None,
                     "approved": False,
                 }
                 filtered_list.append(project_dict)
         return filtered_list
 
 
-"""
-The NYSERDA Statewide Distributed Solar Projects database has 230,000 records
-However, the API has a default limit of 1,000 rows.
-This function repeatedly queries the API with different offsets to get all the records.
-"""
-
-
 def query_nyserda_solar_repeat():
+    """
+    The NYSERDA Statewide Distributed Solar Projects database has 230,000 records.
+    However, the API has a default limit of 1,000 rows per request.
+    This function repeatedly queries the API with different offsets to get all the records.
+    """
     # TODO: get the total number of records from the database by HTML parsing
     length = 250000
     limit = 1000