Updating data pipeline to limit fetch count and HBCUs priority list

OKN-CollabNext · Apr 25, 2024 · 5c9036a · 5c9036a
1 parent 5134285
commit 5c9036a
Show file tree

Hide file tree

Showing 6 changed files with 51 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -49,12 +49,17 @@ You need a `.env` file to store secrets and other environment variables as follo
 
 ```
 OPENALEX_EMAIL=you@example.com
-INSTITUTION_FILTER=hbcus
+INSTITUTIONS_FETCH_FILTER=hbcus
+INSTITUTIONS_FETCH_COUNT=5
 ```
 
 The OPENALEX_EMAIL secret is used to [speed up calls](https://docs.openalex.org/how-to-use-the-api/api-overview) to the OpenAlex REST API.
 
-The INSTITUTION_FILTER (allowed values = `hbcus` or `howardu`) is used to configure which institutions will be fetched from the OpenAlex API and saved to `observable/docs/data/institutions.json`. You will need to delete the existing `institutions.json` file from your local to ensure that a fresh API call is made.
+INSTITUTIONS_FETCH_FILTER (allowed values = `hbcus` or `howardu`) is used to configure which institutions will be fetched from the OpenAlex API and saved to `observable/docs/data/institutions.json`.
+
+INSTITUTIONS_FETCH_COUNT determines how many institutions will be loaded in the application.
+
+>**NOTE:** INSTITUTIONS_FETCH_FILTER and INSTITUTIONS_FETCH_COUNT are only used when running `fetch_custom_institutions.py` directly as a script without command-line arguments (arguments, when given, take precedence). When using `invoke fetch`, the values `hbcus` and `5` are passed as arguments, respectively.
 
 ## Running
 
@@ -75,7 +80,7 @@ Deployments to this project on the Observable Cloud take place through the **Dep
 
 You can run various other commands using `invoke` as follows.
 
-Fetch HBCUs institutions data from the OpenAlex API and save it to `observable/docs/data/institutions.json`:
+Fetch data for the first 5 HBCU institutions from the OpenAlex API and save it to `observable/docs/data/institutions.json`:
 
 ```bash
 invoke fetch

diff --git a/collabnext/openalex/institutions.py b/collabnext/openalex/institutions.py
@@ -1,34 +1,20 @@
 from pyalex import Institution, Institutions
 import json
-import os
-import scripts.fetch_custom_institutions as fetch_custom_institutions
+import sys
 
 
-def get_institutions(institutions_file_path: str = "observable/docs/data/institutions.json") -> list[Institution]:
+def get_institutions(institutions_file_path: str = "docs/data/institutions.json") -> list[Institution]:
     institutions = []
 
     # Load institutions from JSON file
     try:        
         institutions = json.load(open(institutions_file_path))        
     except Exception as e:
-        print("\nError loading institutions from JSON file", institutions_file_path, ":", e, "\n")
-
-    # Fetch institutions from API if JSON file is empty or not found
-    try:
-        if institutions is None or len(institutions) == 0:
-            print("No institutions found in JSON file, attempting to fetch from the API\n")
-            institutions = fetch_custom_institutions.fetch_institutions_from_api(os.getenv("INSTITUTION_FILTER"))
-    except Exception as e:
-        print("\nError fetching institutions from the API:", e, "\n")
+        print("\nError loading institutions from JSON file", institutions_file_path, ":", e, "\n", file=sys.stderr)
 
     # Get 5 random institutions in case of error
     if institutions is None or len(institutions) == 0:
-        print("No institutions found in JSON file or fetched from the API, fetching random institutions\n")
+        print("No institutions found in JSON file, fetching random institutions\n", file=sys.stderr)
         institutions = [Institutions().random() for _ in range(5)]
 
     return institutions
-
-
-if __name__ == "__main__":
-    institutions = get_institutions()
-    print("Loaded", len(institutions), "institutions\n")
diff --git a/observable/docs/data/institutions.json b/observable/docs/data/institutions.json
diff --git a/scripts/fetch_custom_institutions.py b/scripts/fetch_custom_institutions.py
@@ -7,7 +7,8 @@
 
 
 def fetch_institutions_from_api(
-        institution_filter: str,
+        institutions_fetch_filter: str = "hbcus",
+        institutions_fetch_count: int = 5,
         institutions_names_list_path: str = "scripts/hbcus_names_list.csv",
         save_to_file: bool = True,
         institutions_save_path: str = "observable/docs/data/institutions.json"
@@ -16,7 +17,8 @@ def fetch_institutions_from_api(
     Fetch institutions from the OpenAlex API based on the specified filter and save the data to a JSON file
 
     Args:
-        institution_filter (str): The filter to determine which institutions to fetch from the API
+        institutions_fetch_filter (str): The filter to determine which institutions to fetch from the API
+        institutions_fetch_count (int): The number of institutions for which to fetch data
         institutions_names_list_path (str): CSV file path containing the list of HBCUs names
         save_to_file (bool): Whether to save the institutions data to a JSON file
         institutions_save_path (str): JSON file path to save the institutions data to
@@ -28,12 +30,12 @@ def fetch_institutions_from_api(
     institutions = []
 
     try:
-        if institution_filter == "howardu":
+        if institutions_fetch_filter == "howardu":
             # Fetch Howard University based on OpenAlex ID
             institutions = Institutions().filter(openalex="I137853757").get()
             print("\nFetched institution data for Howard University")
 
-        elif institution_filter == "hbcus":
+        elif institutions_fetch_filter == "hbcus":
 
             # Read list of HBCUs Names from Eligibility Data
             inst_df = pd.read_csv(institutions_names_list_path)
@@ -42,20 +44,26 @@ def fetch_institutions_from_api(
             inst_df["query"] = inst_df["query"].str.replace(" &", "")
 
             # Run API search for each HBCU name
+            hbcu_inst_ids = []
+            hbcu_inst_count = 0
             for query in inst_df["query"].tolist():
+                # Break if the required number of institutions have been fetched, else proceed with search query
+                if hbcu_inst_count >= institutions_fetch_count:
+                    break                
                 institutions_query = Institutions().filter(display_name={"search": query}).get()
 
                 # Check search results for name matches and add to institutions list if not already present
                 for inst in institutions_query:
-                    hbcu_inst_ids = [x["id"] for x in institutions]
-                    if (inst["display_name"] in inst_df["name"].tolist()) and (inst["id"] not in hbcu_inst_ids):
+                    if (inst["display_name"] in inst_df["name"].tolist()) and (inst["id"] not in hbcu_inst_ids) and (hbcu_inst_count < institutions_fetch_count):
                         print("Adding institution:", inst["display_name"])
                         institutions.append(inst)
+                        hbcu_inst_ids.append(inst["id"])
+                        hbcu_inst_count += 1
 
             print("\nFetched data for", len(institutions), "out of", inst_df.shape[0], "institutions\n")
 
         else:
-            print("Invalid value of INSTITUTION_FILTER, make sure to set it to 'hbcus' or 'howardu' (without the quotes) in your .env file")
+            print("Invalid value of institutions_fetch_filter, make sure to set it to 'hbcus' or 'howardu' (without the quotes) in your .env file")
 
         # Save institutions data to JSON file if required
         if (save_to_file) and (len(institutions) > 0):            
@@ -70,11 +78,23 @@ def fetch_institutions_from_api(
 
 
 if __name__ == "__main__":
-
-    institution_filter = os.getenv("INSTITUTION_FILTER")
 
-    if sys.argv[1] and sys.argv[1] in ["howardu", "hbcus"]:
-        institution_filter = sys.argv[1]
+    # Check system arguments and environment variables for filter and count of institutions to fetch
+    try:
+        institutions_fetch_filter = str(sys.argv[1])
+        institutions_fetch_count = int(sys.argv[2])
+    except Exception as e:
+        print("\nError parsing system arguments:", e, "\n")
+        try:
+            institutions_fetch_filter = os.getenv("INSTITUTIONS_FETCH_FILTER")
+            institutions_fetch_count = int(os.getenv("INSTITUTIONS_FETCH_COUNT"))
+        except Exception as e:
+            print("\nError fetching environment variables:", e, "\n")
+            institutions_fetch_filter = "hbcus"
+            institutions_fetch_count = 5
+    if institutions_fetch_count is None or institutions_fetch_count <= 0:
+        institutions_fetch_count = 5
 
-    institutions = fetch_institutions_from_api(institution_filter)
+    # Make the API call to fetch data
+    institutions = fetch_institutions_from_api(institutions_fetch_filter, institutions_fetch_count)
     print("Completed fetching institutions data from the OpenAlex API\n")
diff --git a/scripts/hbcus_names_list.csv b/scripts/hbcus_names_list.csv
@@ -1,5 +1,9 @@
 name
-Alabama A & M University
+Alabama Agricultural and Mechanical University
+Fisk University
+Howard University
+Morehouse College
+Texas Southern University
 Alabama State University
 Albany State University
 Alcorn State University
@@ -26,7 +30,6 @@ Dillard University
 Edward Waters College
 Elizabeth City State University
 Fayetteville State University
-Fisk University
 Florida Agricultural and Mechanical University
 Florida Memorial University
 Fort Valley State University
@@ -36,7 +39,6 @@ H Councill Trenholm State Community College
 Hampton University
 Harris-Stowe State University
 Hinds Community College
-Howard University
 Huston-Tillotson University
 J. F. Drake State Community and Technical College
 Jackson State University
@@ -53,7 +55,6 @@ Livingstone College
 Meharry Medical College
 Miles College
 Mississippi Valley State University
-Morehouse College
 Morehouse School of Medicine
 Morgan State University
 Morris Brown College
@@ -84,7 +85,6 @@ Stillman College
 Talladega College
 Tennessee State University
 Texas College
-Texas Southern University
 Tougaloo College
 Tuskegee University
 University of Arkansas at Pine Bluff

diff --git a/tasks.py b/tasks.py
@@ -60,4 +60,4 @@ def touch(c):
 @task
 def fetch(c):
     with cwd("."):
-        c.run("python scripts/fetch_custom_institutions.py hbcus")
+        c.run("python scripts/fetch_custom_institutions.py hbcus 5")