Skip to content

Commit

Permalink
26 - [feat] Parse NYISO link from HTML (#43)
Browse files Browse the repository at this point in the history
* [feat] use BeautifulSoup to find the Interconnection Queue xlsx link on the NYISO website

* [chore] removed extra print statement

* [fix] removed proposed_cod from the NYSERDA large-scale database filters; added W (Land-Based Wind) to the renewable energy abbreviation map

* [refactor] removed unused imports, added back date string formatting in database

* [fix] changed Land Based Wind to Land-Based Wind
  • Loading branch information
deenasun authored Oct 28, 2024
1 parent 6c021be commit d00bc73
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 92 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@ yarn-error.log*
# typescript
*.tsbuildinfo
next-env.d.ts

/ace-ny-venv
__pycache__/
3 changes: 0 additions & 3 deletions api/webscraper/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,8 @@
from utils.scraper_utils import (
create_update_object,
update_kdm,
turn_timestamp_to_string,
)
from database_constants import (
renewable_energy_map,
renewable_energy_set,
initial_kdm_dict,
)

Expand Down
72 changes: 40 additions & 32 deletions api/webscraper/database_constants.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,43 @@
renewable_energy_set = {'Hydroelectric', 'Land Based Wind', 'Offshore Wind', 'Solar', 'Geothermal', 'Energy Storage', 'Pumped Storage'}

renewable_energy_map = {
'H': 'Hydroelectric',
'S': 'Solar',
'ES': 'Energy Storage',
'PS': 'Pumped Storage',
'OSW': 'Offshore Wind',
"Hydroelectric": "Hydroelectric",
"Land Based Wind": "Land-Based Wind",
"Offshore Wind": "Offshore Wind",
"Solar": "Solar",
"Geothermal": "Geothermal",
"Energy Storage": "Energy Storage",
"Pumped Storage": "Pumped Storage",
}

# Lookup table from the short technology codes found in the NYISO
# "Type/ Fuel" column to the readable technology names stored on a project.
# Rows whose code is not a key here are skipped by the NYISO scraper.
renewable_energy_abbreviations = {
    "H": "Hydroelectric",
    "S": "Solar",
    "ES": "Energy Storage",
    "PS": "Pumped Storage",
    "W": "Land-Based Wind",
    "OSW": "Offshore Wind",
}

initial_kdm_dict = [{
'milestoneTitle': 'Entry to NYISO Queue',
'completed': False,
'date': None
},
{'milestoneTitle': 'Application for permit to ORES',
'completed': False,
'date': None
},
{'milestoneTitle': 'Issuance of permit from ORES',
'completed': False,
'date': None
},
{'milestoneTitle': 'Winning a contract award from NYSERDA',
'completed': False,
'date': None
},
{'milestoneTitle': 'Execution of an Interconnection Agreement (IA)',
'completed': False,
'date': None
},
{'milestoneTitle': 'Start of operations',
'completed': False,
'date': None
}]
# Template for a project's key development milestones. Despite the name this
# is a list: one dict per milestone, in order, each starting out not completed
# and without a date.
initial_kdm_dict = [
    {"milestoneTitle": title, "completed": False, "date": None}
    for title in (
        "Entry to NYISO Queue",
        "Application for permit to ORES",
        "Issuance of permit from ORES",
        "Winning a contract award from NYSERDA",
        "Execution of an Interconnection Agreement (IA)",
        "Start of operations",
    )
]
117 changes: 66 additions & 51 deletions api/webscraper/nyiso_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,63 +4,78 @@
import json
from utils.scraper_utils import clean_df_data
from database_constants import (
renewable_energy_map,
renewable_energy_set,
initial_kdm_dict,
renewable_energy_abbreviations,
)

from bs4 import BeautifulSoup

def query_nyiso_excel():
nyiso = requests.get(
"https://www.nyiso.com/documents/20142/1407078/NYISO-Interconnection-Queue.xlsx"
)
nyiso_data = nyiso.content
all_sheets = pd.read_excel(BytesIO(nyiso_data), sheet_name=None)
return all_sheets
# Find the Interconnection Queue spreadsheet link by scraping the NYISO
# "connecting to the grid" page for the anchor whose text is exactly
# "View the Interconnection Queue".
url = "https://www.nyiso.com/connecting-to-the-grid"
page = requests.get(url)

soup = BeautifulSoup(page.content, "html.parser")
interconnection_link = soup.find("a", string="View the Interconnection Queue")

# Bind the name unconditionally: the query functions test
# `nyiso_xlsx_href is None`, which would raise NameError (instead of reaching
# their error branch) if the name were only assigned when the link is found.
nyiso_xlsx_href = None
if interconnection_link is not None:
    # NOTE(review): the href is used as-is by requests.get(); if the page ever
    # serves a relative link it would need to be resolved against `url` first.
    nyiso_xlsx_href = interconnection_link.get("href")

def query_nyiso():
nyiso = requests.get(
"https://www.nyiso.com/documents/20142/1407078/NYISO-Interconnection-Queue.xlsx"
)
nyiso_data = nyiso.content
nyiso_df = pd.read_excel(BytesIO(nyiso_data))

nyiso_df = clean_df_data(nyiso_df)
nyiso_list = nyiso_df.to_dict(orient="records")
def query_nyiso_excel():
    """Download the NYISO Interconnection Queue workbook and load all sheets.

    Returns:
        Whatever ``pd.read_excel(..., sheet_name=None)`` yields for the
        workbook (pandas documents this as a mapping of sheet name to
        DataFrame), or ``None`` when no spreadsheet link was found.
    """
    # Guard clause: without a link there is nothing to download.
    if nyiso_xlsx_href is None:
        print('ERROR: "View the Interconnection Queue" link not found')
        return None

    response = requests.get(nyiso_xlsx_href)
    workbook_bytes = BytesIO(response.content)
    return pd.read_excel(workbook_bytes, sheet_name=None)

filtered_list = []
for item in nyiso_list:
if item.get("Type/ Fuel", None) not in renewable_energy_map.keys():
continue
project_dict = {
"project_name": item.get("Project Name", None),
"project_status": "Proposed", # TODO: update this based on which sheet it's from
"renewable_energy_technology": renewable_energy_map[
item.get("Type/ Fuel")
], # map abbreviations into readable string
"size": item.get("SP (MW)", None),
"developer": item.get("Developer Name", None),
"proposed_cod": item.get(
"Proposed COD", None
), # note: non-serializable into JSON --> can't directly write to file
"county": item.get("County", None),
"region": None, # missing
"zipcode": None, # missing
"latitude": None,
"longitude": None,
# 'data_through_date': item.get('Last Updated Date', None),
"key_development_milestones": None,
"project_image": None,
"interconnection_queue_number": item.get("Queue Pos.", None),
"approved": False,
# the following fields are used for updating kdms when updating the database
"date_of_ir": item.get("Date of IR", None), # already a datetime object
"ia_tender_date": item.get("IA Tender Date", None),
}
filtered_list.append(project_dict)

return filtered_list
def query_nyiso():
    """Fetch the NYISO Interconnection Queue and return its renewable projects.

    Reads the workbook with pandas' default sheet selection, cleans it with
    ``clean_df_data``, and keeps only rows whose "Type/ Fuel" code is a key of
    ``renewable_energy_abbreviations``.

    Returns:
        A list of project dicts, or ``None`` when no spreadsheet link was
        found.
    """
    # Guard clause: without a link there is nothing to download.
    if nyiso_xlsx_href is None:
        print('ERROR: "View the Interconnection Queue" link not found')
        return None

    response = requests.get(nyiso_xlsx_href)
    queue_df = clean_df_data(pd.read_excel(BytesIO(response.content)))

    projects = []
    for row in queue_df.to_dict(orient="records"):
        fuel_code = row.get("Type/ Fuel", None)
        if fuel_code not in renewable_energy_abbreviations:
            # Not a technology we track.
            continue
        projects.append(
            {
                "project_name": row.get("Project Name", None),
                # TODO: update this based on which sheet it's from
                "project_status": "Proposed",
                # Expand the abbreviation into its readable name.
                "renewable_energy_technology": renewable_energy_abbreviations[
                    fuel_code
                ],
                "size": row.get("SP (MW)", None),
                "developer": row.get("Developer Name", None),
                # NOTE: not JSON-serializable, so it can't be written to a
                # file directly.
                "proposed_cod": row.get("Proposed COD", None),
                "county": row.get("County", None),
                "region": None,  # missing
                "zipcode": None,  # missing
                "latitude": None,
                "longitude": None,
                # 'data_through_date': row.get('Last Updated Date', None),
                "key_development_milestones": None,
                "project_image": None,
                "interconnection_queue_number": row.get("Queue Pos.", None),
                "approved": False,
                # The remaining fields are used for updating KDMs when
                # updating the database.
                "date_of_ir": row.get("Date of IR", None),  # already a datetime object
                "ia_tender_date": row.get("IA Tender Date", None),
            }
        )

    return projects


def write_nyiso_to_json():
Expand All @@ -84,12 +99,12 @@ def filter_nyiso_list(project_list, sheet_name):
continue
elif sheet_name == "In Service" and item.get("State", None) != "NY":
continue
if item.get("Type/ Fuel", None) not in renewable_energy_map.keys():
if item.get("Type/ Fuel", None) not in renewable_energy_abbreviations.keys():
continue
project_dict = {
"project_name": item.get("Project Name", None),
"project_status": project_status,
"renewable_energy_technology": renewable_energy_map[
"renewable_energy_technology": renewable_energy_abbreviations[
item.get("Type/ Fuel")
], # map abbreviations into readable string
"size": item.get("SP (MW)", None),
Expand Down
12 changes: 6 additions & 6 deletions api/webscraper/nyserda_scraper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import requests
import json
from utils.scraper_utils import check_status, geocode_lat_long, standardize_label
from database_constants import renewable_energy_set, initial_kdm_dict
from database_constants import renewable_energy_map, initial_kdm_dict

"""
This scrapes data from the NYSERDA Large-scale Renewable Projects database.
Expand All @@ -28,16 +28,16 @@ def query_nyserda_large():
)
if (
check_status(item.get("project_status", None)) != "Cancelled"
and item.get("renewable_technology", None) in renewable_energy_set
and item.get("renewable_technology", None)
in renewable_energy_map.keys()
):
project_dict = {
"project_name": item.get("project_name", None),
"project_status": check_status(item.get("project_status", None)),
"renewable_energy_technology": item.get(
"renewable_technology", None
),
"renewable_energy_technology": renewable_energy_map[
item.get("renewable_technology")
],
"developer": item.get("developer_name", None),
"proposed_cod": item.get("year_of_delivery_start_date", None),
"county": item.get("county_province", None),
"region": item.get("redc", None),
"zipcode": item.get("zip_code", None),
Expand Down
Binary file not shown.

0 comments on commit d00bc73

Please sign in to comment.