26 - [feat] Parse NYISO link from HTML (#43)

* [feat] use BeautifulSoup to parse the interconnection queue xlsx link from the NYISO website
* [chore] removed extra print statement
* [fix] removed proposed_cod from NYSERDA large-scale database filters; added W to renewable energy map
* [refactor] removed unused imports, added back date string formatting in database
* [fix] changed Land Based Wind to Land-Based Wind
calblueprint · Oct 28, 2024 · d00bc73 · d00bc73
1 parent 6c021be
commit d00bc73
Show file tree

Hide file tree

Showing 6 changed files with 115 additions and 92 deletions.
diff --git a/.gitignore b/.gitignore
@@ -34,3 +34,6 @@ yarn-error.log*
 # typescript
 *.tsbuildinfo
 next-env.d.ts
+
+/ace-ny-venv
+__pycache__/
diff --git a/api/webscraper/database.py b/api/webscraper/database.py
@@ -15,11 +15,8 @@
 from utils.scraper_utils import (
     create_update_object,
     update_kdm,
-    turn_timestamp_to_string,
 )
 from database_constants import (
-    renewable_energy_map,
-    renewable_energy_set,
     initial_kdm_dict,
 )
 

diff --git a/api/webscraper/database_constants.py b/api/webscraper/database_constants.py
@@ -1,35 +1,43 @@
-renewable_energy_set = {'Hydroelectric', 'Land Based Wind', 'Offshore Wind', 'Solar', 'Geothermal', 'Energy Storage', 'Pumped Storage'}
-
 renewable_energy_map = {
-  'H': 'Hydroelectric',
-  'S': 'Solar',
-  'ES': 'Energy Storage',
-  'PS': 'Pumped Storage',
-  'OSW': 'Offshore Wind',
+    "Hydroelectric": "Hydroelectric",
+    "Land Based Wind": "Land-Based Wind",
+    "Offshore Wind": "Offshore Wind",
+    "Solar": "Solar",
+    "Geothermal": "Geothermal",
+    "Energy Storage": "Energy Storage",
+    "Pumped Storage": "Pumped Storage",
+}
+
+renewable_energy_abbreviations = {
+    "H": "Hydroelectric",
+    "S": "Solar",
+    "ES": "Energy Storage",
+    "PS": "Pumped Storage",
+    "W": "Land-Based Wind",
+    "OSW": "Offshore Wind",
 }
 
-initial_kdm_dict = [{
-  'milestoneTitle': 'Entry to NYISO Queue',
-  'completed': False,
-  'date': None
-},
-  {'milestoneTitle': 'Application for permit to ORES',
-  'completed': False,
-  'date': None
-},
-  {'milestoneTitle': 'Issuance of permit from ORES',
-  'completed': False,
-  'date': None
-},
-  {'milestoneTitle': 'Winning a contract award from NYSERDA',
-  'completed': False,
-  'date': None
-},
-  {'milestoneTitle': 'Execution of an Interconnection Agreement (IA)',
-  'completed': False,
-  'date': None
-},
-  {'milestoneTitle': 'Start of operations',
-  'completed': False,
-  'date': None
-}]
+initial_kdm_dict = [
+    {"milestoneTitle": "Entry to NYISO Queue", "completed": False, "date": None},
+    {
+        "milestoneTitle": "Application for permit to ORES",
+        "completed": False,
+        "date": None,
+    },
+    {
+        "milestoneTitle": "Issuance of permit from ORES",
+        "completed": False,
+        "date": None,
+    },
+    {
+        "milestoneTitle": "Winning a contract award from NYSERDA",
+        "completed": False,
+        "date": None,
+    },
+    {
+        "milestoneTitle": "Execution of an Interconnection Agreement (IA)",
+        "completed": False,
+        "date": None,
+    },
+    {"milestoneTitle": "Start of operations", "completed": False, "date": None},
+]
diff --git a/api/webscraper/nyiso_scraper.py b/api/webscraper/nyiso_scraper.py
@@ -4,63 +4,78 @@
 import json
 from utils.scraper_utils import clean_df_data
 from database_constants import (
-    renewable_energy_map,
-    renewable_energy_set,
-    initial_kdm_dict,
+    renewable_energy_abbreviations,
 )
 
+from bs4 import BeautifulSoup
 
-def query_nyiso_excel():
-    nyiso = requests.get(
-        "https://www.nyiso.com/documents/20142/1407078/NYISO-Interconnection-Queue.xlsx"
-    )
-    nyiso_data = nyiso.content
-    all_sheets = pd.read_excel(BytesIO(nyiso_data), sheet_name=None)
-    return all_sheets
+url = "https://www.nyiso.com/connecting-to-the-grid"
+page = requests.get(url)
 
+soup = BeautifulSoup(page.content, "html.parser")
+interconnection_link = soup.find("a", string="View the Interconnection Queue")
+# Always bind the name: None (not NameError) when the link is missing, so the query guards can run.
+nyiso_xlsx_href = interconnection_link.get("href") if interconnection_link is not None else None
 
-def query_nyiso():
-    nyiso = requests.get(
-        "https://www.nyiso.com/documents/20142/1407078/NYISO-Interconnection-Queue.xlsx"
-    )
-    nyiso_data = nyiso.content
-    nyiso_df = pd.read_excel(BytesIO(nyiso_data))
 
-    nyiso_df = clean_df_data(nyiso_df)
-    nyiso_list = nyiso_df.to_dict(orient="records")
+def query_nyiso_excel():
+    if nyiso_xlsx_href is None:
+        print('ERROR: "View the Interconnection Queue" link not found')
+        return
+    else:
+        nyiso = requests.get(nyiso_xlsx_href)
+        nyiso_data = nyiso.content
+        all_sheets = pd.read_excel(BytesIO(nyiso_data), sheet_name=None)
+        return all_sheets
 
-    filtered_list = []
-    for item in nyiso_list:
-        if item.get("Type/ Fuel", None) not in renewable_energy_map.keys():
-            continue
-        project_dict = {
-            "project_name": item.get("Project Name", None),
-            "project_status": "Proposed",  # TODO: update this based on which sheet it's from
-            "renewable_energy_technology": renewable_energy_map[
-                item.get("Type/ Fuel")
-            ],  # map abbreviations into readable string
-            "size": item.get("SP (MW)", None),
-            "developer": item.get("Developer Name", None),
-            "proposed_cod": item.get(
-                "Proposed COD", None
-            ),  # note: non-serializable into JSON --> can't directly write to file
-            "county": item.get("County", None),
-            "region": None,  # missing
-            "zipcode": None,  # missing
-            "latitude": None,
-            "longitude": None,
-            # 'data_through_date': item.get('Last Updated Date', None),
-            "key_development_milestones": None,
-            "project_image": None,
-            "interconnection_queue_number": item.get("Queue Pos.", None),
-            "approved": False,
-            # the following fields are used for updating kdms when updating the database
-            "date_of_ir": item.get("Date of IR", None),  # already a datetime object
-            "ia_tender_date": item.get("IA Tender Date", None),
-        }
-        filtered_list.append(project_dict)
 
-    return filtered_list
+def query_nyiso():
+    if nyiso_xlsx_href is None:
+        print('ERROR: "View the Interconnection Queue" link not found')
+        return
+    else:
+        nyiso = requests.get(nyiso_xlsx_href)
+        nyiso_data = nyiso.content
+        nyiso_df = pd.read_excel(BytesIO(nyiso_data))
+
+        nyiso_df = clean_df_data(nyiso_df)
+        nyiso_list = nyiso_df.to_dict(orient="records")
+
+        filtered_list = []
+        for item in nyiso_list:
+            if (
+                item.get("Type/ Fuel", None)
+                not in renewable_energy_abbreviations.keys()
+            ):
+                continue
+            project_dict = {
+                "project_name": item.get("Project Name", None),
+                "project_status": "Proposed",  # TODO: update this based on which sheet it's from
+                "renewable_energy_technology": renewable_energy_abbreviations[
+                    item.get("Type/ Fuel")
+                ],  # map abbreviations into readable string
+                "size": item.get("SP (MW)", None),
+                "developer": item.get("Developer Name", None),
+                "proposed_cod": item.get(
+                    "Proposed COD", None
+                ),  # note: non-serializable into JSON --> can't directly write to file
+                "county": item.get("County", None),
+                "region": None,  # missing
+                "zipcode": None,  # missing
+                "latitude": None,
+                "longitude": None,
+                # 'data_through_date': item.get('Last Updated Date', None),
+                "key_development_milestones": None,
+                "project_image": None,
+                "interconnection_queue_number": item.get("Queue Pos.", None),
+                "approved": False,
+                # the following fields are used for updating kdms when updating the database
+                "date_of_ir": item.get("Date of IR", None),  # already a datetime object
+                "ia_tender_date": item.get("IA Tender Date", None),
+            }
+            filtered_list.append(project_dict)
+
+        return filtered_list
 
 
 def write_nyiso_to_json():
@@ -84,12 +99,12 @@ def filter_nyiso_list(project_list, sheet_name):
             continue
         elif sheet_name == "In Service" and item.get("State", None) != "NY":
             continue
-        if item.get("Type/ Fuel", None) not in renewable_energy_map.keys():
+        if item.get("Type/ Fuel", None) not in renewable_energy_abbreviations.keys():
             continue
         project_dict = {
             "project_name": item.get("Project Name", None),
             "project_status": project_status,
-            "renewable_energy_technology": renewable_energy_map[
+            "renewable_energy_technology": renewable_energy_abbreviations[
                 item.get("Type/ Fuel")
             ],  # map abbreviations into readable string
             "size": item.get("SP (MW)", None),

diff --git a/api/webscraper/nyserda_scraper.py b/api/webscraper/nyserda_scraper.py
@@ -1,7 +1,7 @@
 import requests
 import json
 from utils.scraper_utils import check_status, geocode_lat_long, standardize_label
-from database_constants import renewable_energy_set, initial_kdm_dict
+from database_constants import renewable_energy_map, initial_kdm_dict
 
 """
 This scrapes data from the NYSERDA Large-scale Renewable Projects database.
@@ -28,16 +28,16 @@ def query_nyserda_large():
             )
             if (
                 check_status(item.get("project_status", None)) != "Cancelled"
-                and item.get("renewable_technology", None) in renewable_energy_set
+                and item.get("renewable_technology", None)
+                in renewable_energy_map.keys()
             ):
                 project_dict = {
                     "project_name": item.get("project_name", None),
                     "project_status": check_status(item.get("project_status", None)),
-                    "renewable_energy_technology": item.get(
-                        "renewable_technology", None
-                    ),
+                    "renewable_energy_technology": renewable_energy_map[
+                        item.get("renewable_technology")
+                    ],
                     "developer": item.get("developer_name", None),
-                    "proposed_cod": item.get("year_of_delivery_start_date", None),
                     "county": item.get("county_province", None),
                     "region": item.get("redc", None),
                     "zipcode": item.get("zip_code", None),

diff --git a/api/webscraper/utils/__pycache__/scraper_utils.cpython-312.pyc b/api/webscraper/utils/__pycache__/scraper_utils.cpython-312.pyc