Skip to content

Commit

Permalink
68 - [feat] Check for Project Updates (#77)
Browse files Browse the repository at this point in the history
* [feat] scrape projects from ORES website and insert/update Supabase

* [chore] commented out tests

* [chore] small changes - solar pv, nyserda contract kdm completed/date, tendering of interconnection agreement kdm rename

* [feat] updates for nyserda webscraper, also moved reverse geocoding of small solar projects to database.py

* [feat] moved geocoding and kdm updating for ores to database, wrote function to delete withdrawn nyiso projects, include last_updated field for nyiso

* [chore] temporarily commented out function to check/delete withdrawn nyiso projects

* [fix] fix for datestring representation for Winning a contract award from NYSERDA kdm

* [refactor] update NYSERDA large scale to track last_updated as a dict

* [feat] updated last_updated dict type for all datasources, added function descriptions for database and scraper_utils

* [chore] commenting out functions to delete cancelled and withdrawn projects for now
  • Loading branch information
deenasun authored Nov 21, 2024
1 parent da9b5dd commit cfbcef6
Show file tree
Hide file tree
Showing 9 changed files with 673 additions and 409 deletions.
Binary file modified api/webscraper/__pycache__/database_constants.cpython-312.pyc
Binary file not shown.
Binary file modified api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc
Binary file not shown.
Binary file modified api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc
Binary file not shown.
793 changes: 486 additions & 307 deletions api/webscraper/database.py

Large diffs are not rendered by default.

44 changes: 30 additions & 14 deletions api/webscraper/database_constants.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
# maps renewable energy strings to our app's renewable energy names
# NOTE: each key must appear only once -- a repeated key in a dict literal is
# silently overridden by its last occurrence
renewable_energy_map = {
    "Hydroelectric": "Hydroelectric",
    "Land Based Wind": "Land-Based Wind",
    "Offshore Wind": "Offshore Wind",
    "Solar": "Solar PV",
    "Geothermal": "Geothermal",
    "Energy Storage": "Energy Storage",
    "Pumped Storage": "Pumped Storage",
}

# maps NYISO abbreviations to renewable energy strings
# NOTE: each key must appear only once -- a repeated key in a dict literal is
# silently overridden by its last occurrence
renewable_energy_abbreviations = {
    "H": "Hydroelectric",
    "S": "Solar PV",
    "ES": "Energy Storage",
    "PS": "Pumped Storage",
    "W": "Land-Based Wind",
    "OSW": "Offshore Wind",
}

initial_kdm_dict = [
initial_kdm = [
{"milestoneTitle": "Entry to NYISO Queue", "completed": False, "date": None},
{
"milestoneTitle": "Application for permit to ORES",
Expand All @@ -35,19 +37,33 @@
"date": None,
},
{
"milestoneTitle": "Execution of an Interconnection Agreement (IA)",
"milestoneTitle": "Tendering of an Interconnection Agreement (IA)",
"completed": False,
"date": None,
},
{"milestoneTitle": "Start of operations", "completed": False, "date": None},
{
"milestoneTitle": "Application for permit to ORES",
"completed": False,
"date": None,
},
{
"milestoneTitle": "Issuance of permit from ORES",
"completed": False,
"date": None,
},
]

# names of the fields that make up a project record
# NOTE(review): presumably these mirror the columns of the projects table in
# Supabase -- confirm against database.py
project_fields = [
    # identity and classification
    "project_name",
    "renewable_energy_technology",
    "project_status",
    "developer",
    # location and capacity
    "town",
    "county",
    "region",
    "size",
    "latitude",
    "longitude",
    # progress tracking and presentation
    "key_development_milestones",
    "project_image",
    "interconnection_queue_number",
    "approved",
    # districts and postal code
    "state_senate_district",
    "assembly_district",
    "zipcode",
    # scheduling, permitting and bookkeeping
    "proposed_cod",
    "permit_process",
    "permit_application_number",
    "last_updated",
]
54 changes: 49 additions & 5 deletions api/webscraper/nyiso_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ def query_nyiso_excel():


def query_nyiso():
"""
Queries for all the projects in the NYISO sheet and filters
Outdated - does not modify behavior based on which sheet the project is from (Interconnection Queue, Cluster Projects, In Service)
returns: list of dictionaries representing the projects
"""
if nyiso_xlsx_href is None:
print('ERROR: "View the Interconnection Queue" link not found')
return
Expand All @@ -50,7 +55,7 @@ def query_nyiso():
continue
project_dict = {
"project_name": item.get("Project Name", None),
"project_status": "Proposed", # TODO: update this based on which sheet it's from
"project_status": "Proposed",
"renewable_energy_technology": renewable_energy_abbreviations[
item.get("Type/ Fuel")
], # map abbreviations into readable string
Expand Down Expand Up @@ -95,7 +100,9 @@ def filter_nyiso_list(project_list, sheet_name):
for item in project_list:
if sheet_name == "Interconnection Queue" and item.get("State") != "NY":
continue
elif sheet_name == "Cluster Projects" and item.get("State", None) != "New York":
elif sheet_name == "Cluster Projects" and not (
item.get("State", None) == "New York" or item.get("State", None) == "NY"
):
continue
elif sheet_name == "In Service" and item.get("State", None) != "NY":
continue
Expand All @@ -111,13 +118,22 @@ def filter_nyiso_list(project_list, sheet_name):
"developer": item.get("Developer Name", None),
"proposed_cod": item.get(
"Proposed COD", None
), # note: non-serializable into JSON --> can't directly write to file
), # NOTE: non-serializable into JSON --> can't directly write to file
"county": item.get("County", None),
"region": None, # missing
"zipcode": None, # missing
"latitude": None,
"longitude": None,
# 'data_through_date': item.get('Last Updated Date', None),
"nyiso_last_updated": (
item.get("Last Updated Date", None) # NOTE: non-serializable into JSON
if (
sheet_name == "Interconnection Queue"
or sheet_name == "Cluster Projects"
)
else item.get(
"Last Update NaT", None
) # NOTE: the column header for the in-service sheet is called "Last Update NaT"
),
"key_development_milestones": None,
"project_image": None,
"interconnection_queue_number": item.get("Queue Pos.", None),
Expand Down Expand Up @@ -155,7 +171,7 @@ def filter_nyiso_cluster_sheet():
cluster_projects_df = clean_df_data(cluster_projects_df)
cluster_projects_list = cluster_projects_df.to_dict(orient="records")

filtered_list = filter_nyiso_list(cluster_projects_list, "Cluster Project")
filtered_list = filter_nyiso_list(cluster_projects_list, "Cluster Projects")
return filtered_list


Expand Down Expand Up @@ -184,9 +200,37 @@ def filter_nyiso_in_service_sheet():
return filtered_list


def filter_nyiso_withdrawn_sheets():
    """
    Gathers the names of withdrawn projects from the NYISO workbook
    Reads the sheets at positions 2 and 3 of the workbook (expected to be
    "Withdrawn" and "Cluster Withdrawn"), cleans each one, and combines their rows
    returns: list of dictionaries of the form {"project_name": <name>}, skipping
    rows that have no "Project Name" value
    """
    workbook = query_nyiso_excel()
    ordered_keys = list(workbook.keys())
    # sheets are selected by position, not by name:
    # index 2 should be "Withdrawn", index 3 should be "Cluster Withdrawn"
    target_keys = (ordered_keys[2], ordered_keys[3])

    combined_rows = []
    for key in target_keys:
        cleaned_df = clean_df_data(workbook[key])
        combined_rows.extend(cleaned_df.to_dict(orient="records"))

    withdrawn_names = []
    for row in combined_rows:
        name = row.get("Project Name", None)
        if name is None:
            continue
        withdrawn_names.append({"project_name": name})
    return withdrawn_names


"""
For testing
"""
# write_nyiso_to_json()
# print(filter_nyiso_iq_sheet())
# print(filter_nyiso_in_service_sheet())
# print(filter_nyiso_cluster_sheet())
# print(filter_nyiso_withdrawn_sheets())
78 changes: 42 additions & 36 deletions api/webscraper/nyserda_scraper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import requests
import json
from utils.scraper_utils import check_status, geocode_lat_long, standardize_label
from database_constants import renewable_energy_map, initial_kdm_dict
from database_constants import renewable_energy_map, initial_kdm

"""
This scrapes data from the NYSERDA Large-scale Renewable Projects database.
Expand All @@ -10,6 +10,17 @@
"""


def solicitation_name_to_date(solicitation_name):
    """
    Derives an approximate date string from a solicitation name
    The two characters directly before the first hyphen are read as a two-digit
    year in the 2000s; month and day are always set to January 1st
    (e.g. a name shaped like "RESRFP17-1" gives "2017-01-01")
    solicitation_name: the solicitation name string, or None
    returns: a "20YY-01-01" string, or None if the name is None, contains no
    hyphen, or the text before the first hyphen does not end in two digits
    """
    if solicitation_name is None or "-" not in solicitation_name:
        return None

    prefix = solicitation_name.split("-", 1)[0]
    year = prefix[-2:]
    # without this guard a name like "Main-Tier" or "-1" would produce a
    # malformed date such as "20in-01-01" or "20-01-01"
    if len(year) != 2 or not (year.isascii() and year.isdigit()):
        return None
    return f"20{year}-01-01"


def query_nyserda_large():
nyserda_large_response = requests.get("https://data.ny.gov/resource/dprp-55ye.json")
if nyserda_large_response.status_code != 200:
Expand All @@ -26,11 +37,7 @@ def query_nyserda_large():
if item.get("renewable_technology", None) is not None
else None
)
if (
check_status(item.get("project_status", None)) != "Cancelled"
and item.get("renewable_technology", None)
in renewable_energy_map.keys()
):
if item.get("renewable_technology", None) in renewable_energy_map.keys():
project_dict = {
"project_name": item.get("project_name", None),
"project_status": check_status(item.get("project_status", None)),
Expand All @@ -51,17 +58,19 @@ def query_nyserda_large():
if item.get("georeference", None) is not None
else None
),
# 'data_through_date': item.get('data_through_date', None),
"data_through_date": item.get("data_through_date").split("T")[0],
"permit_process": item.get("permit_process", None),
"interconnection_queue_number": item.get(
"interconnection_queue_number", None
),
"key_development_milestones": initial_kdm_dict,
"size": item.get("new_renewable_capacity_mw", None),
"key_development_milestones": initial_kdm,
"project_image": None,
"approved": False,
"proposed_cod": item.get("year_of_delivery_start_date", None),
# used for updating the kdms
"year_of_delivery_start_date": item.get(
"year_of_delivery_start_date", None
"nyserda_contract_date": solicitation_name_to_date(
item.get("solicitation_name", None)
),
}
filtered_list.append(project_dict)
Expand All @@ -76,17 +85,15 @@ def write_large_to_json():
file.write("\n")


"""
This scrapes data from the NYSERDA Statewide Distributed Solar Projects database.
We filter for specific columns from the database's API and save them to a json file.
https://data.ny.gov/Energy-Environment/Statewide-Distributed-Solar-Projects-Beginning-200/wgsj-jt5f/about_data
geocode_lat_long is a helper util function that uses the google maps geocoding api to get the estimated
latitude and longitude of a project based on the town
"""


def query_nyserda_solar(offset=0, limit=1000):
"""
This scrapes data from the NYSERDA Statewide Distributed Solar Projects database.
We filter for specific columns from the database's API and save them to a json file.
https://data.ny.gov/Energy-Environment/Statewide-Distributed-Solar-Projects-Beginning-200/wgsj-jt5f/about_data
geocode_lat_long is a helper util function that uses the google maps geocoding api to get the estimated
latitude and longitude of a project based on the town
"""
nyserda_small_response = requests.get(
f"https://data.ny.gov/resource/wgsj-jt5f.json?$limit={limit}&$offset={offset}"
)
Expand All @@ -108,45 +115,44 @@ def query_nyserda_solar(offset=0, limit=1000):

if size_in_mw is None or size_in_mw < 2:
continue
if (
item.get("project_id", None) is None
): # some projects have no project_id, so we skip them
continue

if check_status(item.get("project_status", None)) != "Cancelled":
if item.get("city_town", None) is not None:
lat, long = geocode_lat_long(f"{item.get('city_town')}, NY")
else:
lat, long = None, None
project_dict = {
"project_name": item.get(
"project_id", None
), # small data set only has project_id
"project_status": check_status(
item.get("project_status", None)
), # missing
), # NYSERDA small-scale solar projects do not have a project status
"renewable_energy_technology": "Solar",
"size": size_in_mw,
"developer": item.get("developer", None),
"proposed_cod": item.get("interconnection_date", None),
"town": item.get("city_town", None),
"county": item.get("county", None),
"region": item.get("redc", None), # missing
"zipcode": item.get("zip", None),
"latitude": lat,
"longitude": long,
# 'data_through_date': item.get('data_through_date', None),
"key_development_milestones": initial_kdm_dict,
"latitude": None,
"longitude": None,
"data_through_date": item.get("data_through_date").split("T")[0],
"key_development_milestones": initial_kdm,
"project_image": None,
"approved": False,
}
filtered_list.append(project_dict)
return filtered_list


"""
The NYSERDA Statewide Distributed Solar Projects database has 230,000 records
However, the API has a default limit of 1,000 rows.
This function repeatedly queries the API with different offsets to get all the records.
"""


def query_nyserda_solar_repeat():
"""
The NYSERDA Statewide Distributed Solar Projects database has 230,000 records
However, the API has a default limit of 1,000 rows.
This function repeatedly queries the API with different offsets to get all the records.
"""
# TODO: get the total number of records from the database by HTML parsing
length = 250000
limit = 1000
Expand Down
Loading

0 comments on commit cfbcef6

Please sign in to comment.