OKN-CollabNext · kaaloo · Apr 25, 2024 · Apr 24, 2024 · Apr 24, 2024 · Apr 25, 2024
diff --git a/README.md b/README.md
@@ -14,16 +14,16 @@ pyenv install 3.11.4
 
 ### Node
 
-This code base is compatible with node 18 and above. Please use [the following instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs)
+This code base is compatible with node 18 and above. Please use [these instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs)
 to install node for your operating system if needed.
 
 ### Git
 
-Please [follow the instructions on GitHub](https://github.com/git-guides/install-git) to install git on your system.
+Please follow the [instructions on GitHub](https://github.com/git-guides/install-git) to install git on your system.
 
 ### Poetry
 
-Please [follow the instractions on the Poetry website](https://python-poetry.org/docs/#installation) to install poetry on your system.
+Please follow the [instructions on the Poetry website](https://python-poetry.org/docs/#installation) to install poetry on your system.
 
 ## Getting Started
 
@@ -45,14 +45,17 @@ You can then install project dependencies as follows:
 poetry install
 ```
 
-You need a `.env` file to store secrets as follows:
+You need a `.env` file to store secrets and other environment variables as follows:
 
 ```
 OPENALEX_EMAIL=you@example.com
+INSTITUTION_FILTER=hbcus
 ```
 
 The OPENALEX_EMAIL secret is used to [speed up calls](https://docs.openalex.org/how-to-use-the-api/api-overview) to the OpenAlex REST API.
 
+The INSTITUTION_FILTER variable (allowed values: `hbcus` or `howardu`) configures which institutions are fetched from the OpenAlex API and saved to `observable/docs/data/institutions.json`. Delete any existing `institutions.json` file from your local checkout to ensure that a fresh API call is made.
+
 ## Running
 
 This project uses [Observable Framework](https://observablehq.com/framework/). You can run the site locally in development mode as follows
@@ -72,13 +75,19 @@ Deployments to this project on the Observable Cloud take place through the **Dep
 
 You can run various other commands using `invoke` as follows.
 
-Deploy the site to Observable Cloud.
+Fetch HBCU institution data from the OpenAlex API and save it to `observable/docs/data/institutions.json`:
+
+```bash
+invoke fetch
+```
+
+Deploy the site to Observable Cloud:
 
 ```bash
 invoke deploy
 ```
 
-Build the static web site locally.
+Build the static web site locally:
 
 ```bash
 invoke build
@@ -87,13 +96,13 @@ invoke build
 Manually cause a graph.json refresh. This is needed because currently
 observable framework doesn't notice if a dependent python module
 has been changed when developing. It only monitors changes to
-the particular page that is being displayed.
+the particular page that is being displayed:
 
 ```bash
 invoke touch
 ```
 
-Delete local git branches that have already been merged.
+Delete local git branches that have already been merged:
 
 ```bash
 invoke clean-branches

diff --git a/collabnext/custom.py b/collabnext/custom.py
diff --git a/collabnext/openalex/institutions.py b/collabnext/openalex/institutions.py
@@ -1,6 +1,34 @@
 from pyalex import Institution, Institutions
+import json
+import os
+import scripts.fetch_custom_institutions as fetch_custom_institutions
 
 
-def get_institutions() -> list[Institution]:
-    # Get 5 random institutions for now
-    return [Institutions().random() for _ in range(5)]
+def get_institutions(institutions_file_path: str = "observable/docs/data/institutions.json") -> list[Institution]:
+    institutions = []
+
+    # Load institutions from JSON file
+    try:        
+        institutions = json.load(open(institutions_file_path))        
+    except Exception as e:
+        print("\nError loading institutions from JSON file", institutions_file_path, ":", e, "\n")
+
+    # Fetch institutions from API if JSON file is empty or not found
+    try:
+        if institutions is None or len(institutions) == 0:
+            print("No institutions found in JSON file, attempting to fetch from the API\n")
+            institutions = fetch_custom_institutions.fetch_institutions_from_api(os.getenv("INSTITUTION_FILTER"))
+    except Exception as e:
+        print("\nError fetching institutions from the API:", e, "\n")
+
+    # Get 5 random institutions in case of error
+    if institutions is None or len(institutions) == 0:
+        print("No institutions found in JSON file or fetched from the API, fetching random institutions\n")
+        institutions = [Institutions().random() for _ in range(5)]
+
+    return institutions
+
+
+if __name__ == "__main__":
+    institutions = get_institutions()
+    print("Loaded", len(institutions), "institutions\n")
diff --git a/collabnext/settings.py b/collabnext/settings.py
diff --git a/data/institutions_hbcus.json b/data/institutions_hbcus.json
diff --git a/data/institutions_howardu.json b/data/institutions_howardu.json
diff --git a/observable/docs/data/graph.sqlite.py b/observable/docs/data/graph.sqlite.py
@@ -23,20 +23,8 @@
 from collabnext.openalex.topics import get_work_topics
 from collabnext.openalex.works import get_works_by_authors
 
-from collabnext import settings, custom
-
 # Get institutions
-institutions = []
-try:
-    if settings.INSTITUTION_FILTER == "howardu":
-        institutions = custom.get_institutions_howardu()
-    elif settings.INSTITUTION_FILTER == "hbcus":
-        institutions = custom.get_institutions_hbcus(settings.DATA_LOAD_TYPE)
-except Exception as e:
-    print("\nError getting custom institutions:", e, "\n")
-
-if institutions is None or len(institutions) == 0:
-    institutions = get_institutions()
+institutions = get_institutions()
 
 # Create nodes
 institution_nodes = make_institution_nodes(institutions)

diff --git a/observable/docs/data/institutions.json b/observable/docs/data/institutions.json
diff --git a/requirements.txt b/requirements.txt
diff --git a/scripts/fetch_custom_institutions.py b/scripts/fetch_custom_institutions.py
@@ -0,0 +1,80 @@
+
+from pyalex import Institutions, Institution
+import pandas as pd
+import json
+import os
+import sys
+
+
+def fetch_institutions_from_api(
+        institution_filter: str,
+        institutions_names_list_path: str = "scripts/hbcus_names_list.csv",
+        save_to_file: bool = True,
+        institutions_save_path: str = "observable/docs/data/institutions.json"
+) -> list[Institution]:
+    """
+    Fetch institutions from the OpenAlex API based on the specified filter and save the data to a JSON file
+
+    Args:
+        institution_filter (str): The filter to determine which institutions to fetch from the API
+        institutions_names_list_path (str): CSV file path containing the list of HBCUs names
+        save_to_file (bool): Whether to save the institutions data to a JSON file
+        institutions_save_path (str): JSON file path to save the institutions data to
+
+    Returns:
+        list[Institution]: The list of institutions fetched from the API
+    """
+
+    institutions = []
+
+    try:
+        if institution_filter == "howardu":
+            # Fetch Howard University based on OpenAlex ID
+            institutions = Institutions().filter(openalex="I137853757").get()
+            print("\nFetched institution data for Howard University")
+
+        elif institution_filter == "hbcus":
+
+            # Read list of HBCUs Names from Eligibility Data
+            inst_df = pd.read_csv(institutions_names_list_path)
+            print("\nLoaded list of HBCUs names from:", institutions_names_list_path, "\n")
+            inst_df["query"] = inst_df["name"].str.lower()
+            inst_df["query"] = inst_df["query"].str.replace(" &", "")
+
+            # Run API search for each HBCU name
+            for query in inst_df["query"].tolist():
+                institutions_query = Institutions().filter(display_name={"search": query}).get()
+
+                # Check search results for name matches and add to institutions list if not already present
+                for inst in institutions_query:
+                    hbcu_inst_ids = [x["id"] for x in institutions]
+                    if (inst["display_name"] in inst_df["name"].tolist()) and (inst["id"] not in hbcu_inst_ids):
+                        print("Adding institution:", inst["display_name"])
+                        institutions.append(inst)
+
+            print("\nFetched data for", len(institutions), "out of", inst_df.shape[0], "institutions\n")
+
+        else:
+            print("Invalid value of INSTITUTION_FILTER, make sure to set it to 'hbcus' or 'howardu' (without the quotes) in your .env file")
+
+        # Save institutions data to JSON file if required
+        if (save_to_file) and (len(institutions) > 0):            
+            with open(institutions_save_path, "w") as f:
+                json.dump(institutions, f)
+                print("Institutions data saved to", institutions_save_path, "\n")
+
+    except Exception as e:
+        print("\nError fetching institutions from the API:", e, "\n")
+
+    return institutions
+
+
+if __name__ == "__main__":
+
+    institution_filter = os.getenv("INSTITUTION_FILTER")
+
+    if sys.argv[1] and sys.argv[1] in ["howardu", "hbcus"]:
+        institution_filter = sys.argv[1]
+
+    institutions = fetch_institutions_from_api(institution_filter)
+    print("Completed fetching institutions data from the OpenAlex API\n")
diff --git a/data/institutions_hbcus.csv → scripts/hbcus_names_list.csv b/data/institutions_hbcus.csv → scripts/hbcus_names_list.csv
diff --git a/tasks.py b/tasks.py
@@ -55,3 +55,9 @@ def clean_branches(c):
 def touch(c):
     with cwd("observable/docs/data"):
         c.run("touch graph.sqlite.py")
+
+
+@task
+def fetch(c):
+    with cwd("."):
+        c.run("python scripts/fetch_custom_institutions.py hbcus")