Skip to content

Commit

Permalink
Updating data pipeline to limit fetch count and HBCUs priority list
Browse files Browse the repository at this point in the history
  • Loading branch information
whymath committed Apr 25, 2024
1 parent 5134285 commit 5c9036a
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 40 deletions.
11 changes: 8 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,17 @@ You need a `.env` file to store secrets and other environment variables as follo

```
OPENALEX_EMAIL=your_email@example.com
INSTITUTION_FILTER=hbcus
INSTITUTIONS_FETCH_FILTER=hbcus
INSTITUTIONS_FETCH_COUNT=5
```

The OPENALEX_EMAIL secret is used to [speed up calls](https://docs.openalex.org/how-to-use-the-api/api-overview) to the OpenAlex REST API.

The INSTITUTION_FILTER (allowed values = `hbcus` or `howardu`) is used to configure which institutions will be fetched from the OpenAlex API and saved to `observable/docs/data/institutions.json`. You will need to delete the existing `institutions.json` file from your local to ensure that a fresh API call is made.
INSTITUTIONS_FETCH_FILTER (allowed values = `hbcus` or `howardu`) is used to configure which institutions will be fetched from the OpenAlex API and saved to `observable/docs/data/institutions.json`.

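INSTITUTIONS_FETCH_COUNT (a positive integer; falls back to `5` if it is missing or not positive) sets the maximum number of institutions that will be fetched from the OpenAlex API and saved to `observable/docs/data/institutions.json`, which is the data the application then loads.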

> **NOTE:** INSTITUTIONS_FETCH_FILTER and INSTITUTIONS_FETCH_COUNT are only read when `scripts/fetch_custom_institutions.py` is run directly as a script without valid command-line arguments (filter and count). `invoke fetch` always passes `hbcus` and `5` explicitly, so the environment variables are not used in that case.
## Running

Expand All @@ -75,7 +80,7 @@ Deployments to this project on the Observable Cloud take place through the **Dep

You can run various other commands using `invoke` as follows.

Fetch HBCUs institutions data from the OpenAlex API and save it to `observable/docs/data/institutions.json`:
Fetch data for the first 5 HBCU institutions from the OpenAlex API and save it to `observable/docs/data/institutions.json`:

```bash
invoke fetch
Expand Down
22 changes: 4 additions & 18 deletions collabnext/openalex/institutions.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,20 @@
from pyalex import Institution, Institutions
import json
import os
import scripts.fetch_custom_institutions as fetch_custom_institutions
import sys


def get_institutions(institutions_file_path: str = "observable/docs/data/institutions.json") -> list[Institution]:
def get_institutions(institutions_file_path: str = "docs/data/institutions.json") -> list[Institution]:
institutions = []

# Load institutions from JSON file
try:
institutions = json.load(open(institutions_file_path))
except Exception as e:
print("\nError loading institutions from JSON file", institutions_file_path, ":", e, "\n")

# Fetch institutions from API if JSON file is empty or not found
try:
if institutions is None or len(institutions) == 0:
print("No institutions found in JSON file, attempting to fetch from the API\n")
institutions = fetch_custom_institutions.fetch_institutions_from_api(os.getenv("INSTITUTION_FILTER"))
except Exception as e:
print("\nError fetching institutions from the API:", e, "\n")
print("\nError loading institutions from JSON file", institutions_file_path, ":", e, "\n", file=sys.stderr)

# Get 5 random institutions in case of error
if institutions is None or len(institutions) == 0:
print("No institutions found in JSON file or fetched from the API, fetching random institutions\n")
print("No institutions found in JSON file, fetching random institutions\n", file=sys.stderr)
institutions = [Institutions().random() for _ in range(5)]

return institutions


if __name__ == "__main__":
institutions = get_institutions()
print("Loaded", len(institutions), "institutions\n")
2 changes: 1 addition & 1 deletion observable/docs/data/institutions.json

Large diffs are not rendered by default.

44 changes: 32 additions & 12 deletions scripts/fetch_custom_institutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@


def fetch_institutions_from_api(
institution_filter: str,
institutions_fetch_filter: str = "hbcus",
institutions_fetch_count: int = 5,
institutions_names_list_path: str = "scripts/hbcus_names_list.csv",
save_to_file: bool = True,
institutions_save_path: str = "observable/docs/data/institutions.json"
Expand All @@ -16,7 +17,8 @@ def fetch_institutions_from_api(
Fetch institutions from the OpenAlex API based on the specified filter and save the data to a JSON file
Args:
institution_filter (str): The filter to determine which institutions to fetch from the API
institutions_fetch_filter (str): The filter to determine which institutions to fetch from the API
institutions_fetch_count (int): The number of institutions for which to fetch data
institutions_names_list_path (str): CSV file path containing the list of HBCUs names
save_to_file (bool): Whether to save the institutions data to a JSON file
institutions_save_path (str): JSON file path to save the institutions data to
Expand All @@ -28,12 +30,12 @@ def fetch_institutions_from_api(
institutions = []

try:
if institution_filter == "howardu":
if institutions_fetch_filter == "howardu":
# Fetch Howard University based on OpenAlex ID
institutions = Institutions().filter(openalex="I137853757").get()
print("\nFetched institution data for Howard University")

elif institution_filter == "hbcus":
elif institutions_fetch_filter == "hbcus":

# Read list of HBCUs Names from Eligibility Data
inst_df = pd.read_csv(institutions_names_list_path)
Expand All @@ -42,20 +44,26 @@ def fetch_institutions_from_api(
inst_df["query"] = inst_df["query"].str.replace(" &", "")

# Run API search for each HBCU name
hbcu_inst_ids = []
hbcu_inst_count = 0
for query in inst_df["query"].tolist():
# Break if the required number of institutions have been fetched, else proceed with search query
if hbcu_inst_count >= institutions_fetch_count:
break
institutions_query = Institutions().filter(display_name={"search": query}).get()

# Check search results for name matches and add to institutions list if not already present
for inst in institutions_query:
hbcu_inst_ids = [x["id"] for x in institutions]
if (inst["display_name"] in inst_df["name"].tolist()) and (inst["id"] not in hbcu_inst_ids):
if (inst["display_name"] in inst_df["name"].tolist()) and (inst["id"] not in hbcu_inst_ids) and (hbcu_inst_count < institutions_fetch_count):
print("Adding institution:", inst["display_name"])
institutions.append(inst)
hbcu_inst_ids.append(inst["id"])
hbcu_inst_count += 1

print("\nFetched data for", len(institutions), "out of", inst_df.shape[0], "institutions\n")

else:
print("Invalid value of INSTITUTION_FILTER, make sure to set it to 'hbcus' or 'howardu' (without the quotes) in your .env file")
print("Invalid value of institutions_fetch_filter, make sure to set it to 'hbcus' or 'howardu' (without the quotes) in your .env file")

# Save institutions data to JSON file if required
if (save_to_file) and (len(institutions) > 0):
Expand All @@ -70,11 +78,23 @@ def fetch_institutions_from_api(


if __name__ == "__main__":

institution_filter = os.getenv("INSTITUTION_FILTER")

if sys.argv[1] and sys.argv[1] in ["howardu", "hbcus"]:
institution_filter = sys.argv[1]
# Check system arguments and environment variables for filter and count of institutions to fetch
try:
institutions_fetch_filter = str(sys.argv[1])
institutions_fetch_count = int(sys.argv[2])
except Exception as e:
print("\nError parsing system arguments:", e, "\n")
try:
institutions_fetch_filter = os.getenv("INSTITUTIONS_FETCH_FILTER")
institutions_fetch_count = int(os.getenv("INSTITUTIONS_FETCH_COUNT"))
except Exception as e:
print("\nError fetching environment variables:", e, "\n")
institutions_fetch_filter = "hbcus"
institutions_fetch_count = 5
if institutions_fetch_count is None or institutions_fetch_count <= 0:
institutions_fetch_count = 5

institutions = fetch_institutions_from_api(institution_filter)
# Make the API call to fetch data
institutions = fetch_institutions_from_api(institutions_fetch_filter, institutions_fetch_count)
print("Completed fetching institutions data from the OpenAlex API\n")
10 changes: 5 additions & 5 deletions scripts/hbcus_names_list.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
name
Alabama A & M University
Alabama Agricultural and Mechanical University
Fisk University
Howard University
Morehouse College
Texas Southern University
Alabama State University
Albany State University
Alcorn State University
Expand All @@ -26,7 +30,6 @@ Dillard University
Edward Waters College
Elizabeth City State University
Fayetteville State University
Fisk University
Florida Agricultural and Mechanical University
Florida Memorial University
Fort Valley State University
Expand All @@ -36,7 +39,6 @@ H Councill Trenholm State Community College
Hampton University
Harris-Stowe State University
Hinds Community College
Howard University
Huston-Tillotson University
J. F. Drake State Community and Technical College
Jackson State University
Expand All @@ -53,7 +55,6 @@ Livingstone College
Meharry Medical College
Miles College
Mississippi Valley State University
Morehouse College
Morehouse School of Medicine
Morgan State University
Morris Brown College
Expand Down Expand Up @@ -84,7 +85,6 @@ Stillman College
Talladega College
Tennessee State University
Texas College
Texas Southern University
Tougaloo College
Tuskegee University
University of Arkansas at Pine Bluff
Expand Down
2 changes: 1 addition & 1 deletion tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,4 @@ def touch(c):
@task
def fetch(c):
with cwd("."):
c.run("python scripts/fetch_custom_institutions.py hbcus")
c.run("python scripts/fetch_custom_institutions.py hbcus 5")

0 comments on commit 5c9036a

Please sign in to comment.