Commit 43c9131
Revamped README and ipynb. Minor modifications to pipelines to support hydra_search.
Kevin Armengol committed May 17, 2023
1 parent 4bad23f commit 43c9131
Showing 20 changed files with 2,197 additions and 531 deletions.
109 changes: 77 additions & 32 deletions README.md
@@ -1,6 +1,18 @@
# data-dictionary-cui-mapping

This package allows you to load in a data dictionary and semi-automatically query appropriate UMLS concepts using the UMLS API, MetaMap API, and/or semantic search through a custom Pinecone vector database.
This package assists with mapping a user's data dictionary fields to [UMLS](https://www.nlm.nih.gov/research/umls/index.html) concepts. It is designed to be modular and flexible to allow for different configurations and use cases.

Roughly, the high-level steps are as follows:
- Configure yaml files
- Load in data dictionary
- Preprocess desired columns
- Query for UMLS concepts using any or all of the following pipeline modules:
- **umls** (*UMLS API*)
- **metamap** (*MetaMap API*)
- **semantic_search** (*relies on access to a custom Pinecone vector database*)
- **hydra_search** (*runs any combination of the above three modules*)
- Manually curate/select concepts in Excel
- Create data dictionary file with new UMLS concept fields

## Prerequisites

@@ -9,7 +21,7 @@ This package allows you to load in a data dictionary and semi-automatically quer

## Installation

Use the package manager [pip](https://pip.pypa.io/en/stable/) to install data-dictionary-cui-mapping or pip install from the GitHub repo.
Use the package manager [pip](https://pip.pypa.io/en/stable/) to install [data-dictionary-cui-mapping](https://pypi.org/project/data-dictionary-cui-mapping/) from PyPI or pip install from the [GitHub repo](https://github.com/kevon217/data-dictionary-cui-mapping). The project uses [poetry](https://python-poetry.org/) for packaging and dependency management.

```bash
pip install data-dictionary-cui-mapping
@@ -18,7 +30,7 @@ pip install data-dictionary-cui-mapping
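Since the project uses poetry for packaging, a source install is also an option. A minimal sketch, assuming you have poetry installed and want to work from a clone of the repo:

```bash
# Clone the repository and install dependencies into a poetry-managed environment
git clone https://github.com/kevon217/data-dictionary-cui-mapping.git
cd data-dictionary-cui-mapping
poetry install
```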

## Input: Data Dictionary

Below is a sample data dictionary format that can be used as input for this package.
Below is a sample data dictionary format (*.csv*) that can be used as input for this package:

| variable name | title | permissible value descriptions |
| ------------- | ---------------------- |--------------------------------|
@@ -51,60 +63,93 @@ In order to run and customize these pipelines, you will need to create/edit yaml
│ │ │ embeddings.yaml
```
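For reference, the preprocessing block of a custom config (e.g., *ddcuimap/configs/custom/de.yaml*) has the shape below. The keys are taken from the config files touched in this commit; the values shown are illustrative, not required settings:

```yaml
preprocessing_settings:
  remove_stopwords: true
  stopwords_filepath:    # optional path to a custom stopwords .csv
  use_cheatsheet: false
  cheatsheet_filepath:
```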

## UMLS API and MetaMap Batch Queries
## CUI Batch Query Pipelines

#### Import modules

### STEP-1A: RUN BATCH QUERY PIPELINE
###### IMPORT PACKAGES

```python
# import batch_query_pipeline modules from metamap OR umls package
from ddcuimap.metamap import batch_query_pipeline as mm_bqp
from ddcuimap.umls import batch_query_pipeline as umls_bqp
# from ddcuimap.umls import batch_query_pipeline as umls_bqp
# from ddcuimap.metamap import batch_query_pipeline as mm_bqp
# from ddcuimap.semantic_search import batch_hybrid_query_pipeline as ss_bqp
from ddcuimap.hydra_search import batch_hydra_query_pipeline as hs_bqp

# import helper functions for loading, viewing, composing configurations for pipeline run
from ddcuimap.utils import helper
from omegaconf import OmegaConf

# import modules to create data dictionary with curated CUIs and check the file for missing mappings
from ddcuimap.curation import create_dictionary_import_file
from ddcuimap.curation import check_cuis
```
#### Load/edit configuration files
###### LOAD/EDIT CONFIGURATION FILES
```python
cfg = helper.compose_config.fn(overrides=["custom=de", "apis=config_metamap_api"]) # custom config for MetaMap on data element 'title' column
# cfg = helper.compose_config.fn(overrides=["custom=de", "apis=config_umls_api"]) # custom config for UMLS API on data element 'title' column
# cfg = helper.compose_config.fn(overrides=["custom=pvd", "apis=config_metamap_api"]) # custom config for MetaMap on 'permissible value descriptions' column
# cfg = helper.compose_config.fn(overrides=["custom=pvd", "apis=config_umls_api"]) # custom config for UMLS API on 'permissible value descriptions' column
cfg.apis.user_info.email = '' # enter your email
cfg.apis.user_info.apiKey = '' # enter your api key
print(OmegaConf.to_yaml(cfg))
cfg_hydra = helper.compose_config.fn(overrides=["custom=hydra_base"])
# cfg_umls = helper.compose_config.fn(overrides=["custom=de", "apis=config_umls_api"])
cfg_mm = helper.compose_config.fn(overrides=["custom=de", "apis=config_metamap_api"])
cfg_ss = helper.compose_config.fn(
overrides=[
"custom=title_def",
"semantic_search=embeddings",
"apis=config_pinecone_api",
]
)

# # UMLS API CREDENTIALS
# cfg_umls.apis.umls.user_info.apiKey = ''
# cfg_umls.apis.umls.user_info.email = ''

# # MetaMap API CREDENTIALS
# cfg_mm.apis.metamap.user_info.apiKey = ''
# cfg_mm.apis.metamap.user_info.email = ''
#
# # Pinecone API CREDENTIALS
# cfg_ss.apis.pinecone.index_info.apiKey = ''
# cfg_ss.apis.pinecone.index_info.environment = ''

print(OmegaConf.to_yaml(cfg_hydra))
```
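Hardcoding credentials in the config objects works for quick tests, but pulling them from environment variables keeps keys out of notebooks and version control. A minimal sketch, assuming the configs composed above; the environment variable names here are illustrative and not defined by the package:

```python
import os

# Illustrative environment variable names -- not part of the package API.
cfg_mm.apis.metamap.user_info.apiKey = os.environ.get("METAMAP_API_KEY", "")
cfg_mm.apis.metamap.user_info.email = os.environ.get("UMLS_EMAIL", "")
cfg_ss.apis.pinecone.index_info.apiKey = os.environ.get("PINECONE_API_KEY", "")
cfg_ss.apis.pinecone.index_info.environment = os.environ.get("PINECONE_ENV", "")
```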

#### Step 1: Run batch query pipeline
###### RUN BATCH QUERY PIPELINE
```python
df_final_mm = mm_bqp.run_mm_batch(cfg) # run MetaMap batch query pipeline
# df_final_umls = umls_bqp.run_umls_batch(cfg) # run UMLS API batch query pipeline
# df_umls, cfg_umls = umls_bqp.run_umls_batch(cfg_umls)
# df_mm, cfg_mm = mm_bqp.run_mm_batch(cfg_mm)
# df_ss, cfg_ss = ss_bqp.run_hybrid_ss_batch(cfg_ss)
df_hydra, cfg_step1 = hs_bqp.run_hydra_batch(cfg_hydra, cfg_umls=None, cfg_mm=cfg_mm, cfg_ss=cfg_ss)

print(df_hydra.head())
```
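Because `run_hydra_batch` now takes each module's config as an optional keyword argument (see the updated signature in *ddcuimap/hydra_search/batch_hydra_query_pipeline.py* below), any subset of the three search modules can be run. For example, a MetaMap-only run might look like this sketch:

```python
# Run the hydra search pipeline with only the MetaMap module enabled;
# modules whose config is None are skipped.
df_mm_only, cfg_step1 = hs_bqp.run_hydra_batch(
    cfg_hydra, cfg_umls=None, cfg_mm=cfg_mm, cfg_ss=None
)
```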

#### Step 2: Manual curation step in Excel file
### STEP-1B: **MANUAL CURATION STEP IN EXCEL**

###### CURATION/SELECTION
*see curation example in* **notebooks/examples_files/DE_Step-1_curation_keepCol.xlsx**

#### Step 3: Create data dictionary import file
### STEP-2A: CREATE DATA DICTIONARY IMPORT FILE

###### IMPORT CURATION MODULES
```python
from ddcuimap.curation import create_dictionary_import_file
from ddcuimap.curation import check_cuis
from ddcuimap.utils import helper
```
###### CREATE DATA DICTIONARY IMPORT FILE

```python
cfg = helper.load_config.fn(helper.choose_file.fn("Load config file from Step 1"))
create_dictionary_import_file.create_dd_file(cfg)
cfg_step1 = helper.load_config.fn(helper.choose_file("Load config file from Step 1"))
df_dd = create_dictionary_import_file.create_dd_file(cfg_step1)
print(df_dd.head())
```

#### Step 4: Check curated cui mappings
### STEP-2B: CHECK CUIS IN DATA DICTIONARY IMPORT FILE

###### CHECK CUIS
```python
cfg = helper.load_config.fn(helper.choose_file.fn("Load config file from Step 2"))
check_cuis.check_cuis(cfg)
cfg_step2 = helper.load_config.fn(helper.choose_file("Load config file from Step 2"))
df_check = check_cuis.check_cuis(cfg_step2)
print(df_check.head())
```

## Output: Data Dictionary + CUIs
Below is the final output of the data dictionary with curated CUIs.
Below is a sample modified data dictionary with curated CUIs after:
1. Running Steps 1-2 on **title**, then taking the generated output dictionary file, and
2. Running Steps 1-2 again on **permissible value descriptions** to get the final output dictionary file.

| variable name | title | data element concept identifiers | data element concept names | data element terminology sources | permissible values | permissible value descriptions | permissible value output codes | permissible value concept identifiers | permissible value concept names | permissible value terminology sources |
| ------------- | ---------------------- | -------------------------------- | -------------------------- | -------------------------------- | -------------------- | ------------------------------ | ------------------------------ | ------------------------------------- | ----------------------------------------- | ------------------------------------- |
4 changes: 1 addition & 3 deletions ddcuimap/configs/config.yaml
@@ -4,7 +4,5 @@ defaults:
- config_umls_api
- config_metamap_api
- config_pinecone_api
- custom:
- de
- title_def
- custom: null
- semantic_search: null
2 changes: 1 addition & 1 deletion ddcuimap/configs/custom/de.yaml
@@ -13,7 +13,7 @@ data_dictionary_settings:

preprocessing_settings:
remove_stopwords : true
stopwords_filepath: 'C:\\Users\\armengolkm\\Desktop\\Full Pipeline Test v1.1.0\\MetaMap_Settings_StopWords.csv'
stopwords_filepath:
use_cheatsheet : false
cheatsheet_filepath:

2 changes: 1 addition & 1 deletion ddcuimap/configs/custom/hydra_base.yaml
@@ -14,7 +14,7 @@ data_dictionary_settings:

preprocessing_settings:
remove_stopwords :
stopwords_filepath: 'C:\\Users\\armengolkm\\Desktop\\Full Pipeline Test v1.1.0\\MetaMap_Settings_StopWords.csv'
stopwords_filepath:
use_cheatsheet :
cheatsheet_filepath:

2 changes: 1 addition & 1 deletion ddcuimap/configs/custom/pvd.yaml
@@ -14,7 +14,7 @@ data_dictionary_settings:

preprocessing_settings:
remove_stopwords : true
stopwords_filepath: 'C:\\Users\\armengolkm\\Desktop\\Full Pipeline Test v1.1.0\\MetaMap_Settings_StopWords.csv'
stopwords_filepath:
use_cheatsheet : false
cheatsheet_filepath:

2 changes: 1 addition & 1 deletion ddcuimap/configs/custom/title_def.yaml
@@ -14,7 +14,7 @@ data_dictionary_settings:

preprocessing_settings:
remove_stopwords : false
stopwords_filepath: 'C:\\Users\\armengolkm\\Desktop\\Full Pipeline Test v1.1.0\\MetaMap_Settings_StopWords.csv'
stopwords_filepath:
use_cheatsheet : false
cheatsheet_filepath:

13 changes: 11 additions & 2 deletions ddcuimap/curation/utils/curation_functions.py
@@ -226,11 +226,20 @@ def concat_cols_umls(df, umls_columns: list):
# @task(name="Reordering examples dictionary columns")
def reorder_cols(df, order: list):
"""Reorder columns"""

df = df[order]
order_exists = keep_existing_cols(df.columns, order)
df = df[order_exists]
return df


def keep_existing_cols(df_cols, cols_to_check: list):
"""Keep existing columns"""
cols_incl = list(set(cols_to_check).intersection(df_cols))
cols_excl = list(set(cols_to_check).difference(df_cols))
cols = [x for x in df_cols if x not in cols_excl]
print(f"The following columns were not found and will be excluded: {cols_excl}")
return cols


@task(name="Manual override of column values")
def override_cols(df, override: dict):
"""Custom function to accommodate current bug in BRICS examples dictionary import process that wants multi-CUI concepts to have a single source terminology
@@ -1,7 +1,7 @@
import pandas as pd
from prefect import flow, task

from . import helper as helper
from ddcuimap.utils import helper as helper
from . import text_processing as tp


File renamed without changes.
85 changes: 49 additions & 36 deletions ddcuimap/hydra_search/batch_hydra_query_pipeline.py
@@ -9,13 +9,13 @@
from pathlib import Path

import ddcuimap.utils.helper as helper
import ddcuimap.utils.process_data_dictionary as proc_dd
import ddcuimap.curation.utils.process_data_dictionary as proc_dd
import ddcuimap.curation.utils.curation_functions as cur
import ddcuimap.umls.batch_query_pipeline as umls
import ddcuimap.metamap.batch_query_pipeline as mm
import ddcuimap.semantic_search.batch_hybrid_query_pipeline as ss

cfg = helper.compose_config.fn(overrides=["custom=hydra_base"])
cfg_hydra = helper.compose_config.fn(overrides=["custom=hydra_base"])
cfg_umls = helper.compose_config.fn(overrides=["custom=de", "apis=config_umls_api"])
cfg_mm = helper.compose_config.fn(overrides=["custom=de", "apis=config_metamap_api"])
cfg_ss = helper.compose_config.fn(
@@ -32,73 +32,86 @@
flow_run_name="Running UMLS/MetaMap/Semantic Search hydra search pipeline",
log_prints=True,
)
def run_hydra_batch(cfg, cfg_umls, cfg_mm, cfg_ss, **kwargs):
def run_hydra_batch(cfg_hydra, **kwargs):
# LOAD DATA DICTIONARY FILE
df_dd, fp_dd = proc_dd.load_data_dictionary(cfg)
df_dd, fp_dd = proc_dd.load_data_dictionary(cfg_hydra)

# CREATE STEP 1 DIRECTORY
dir_step1 = helper.create_folder.fn(
Path(fp_dd).parent.joinpath(
f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_Hydra-search"
f"{cfg_hydra.custom.curation_settings.file_settings.directory_prefix}_Step-1_Hydra-search"
)
)

# STORE PIPELINE RESULTS
cat_dfs = []

## UMLS API ##
dir_step1_umls = helper.create_folder(
Path(dir_step1).joinpath(
f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_umls-api-search"
cfg_umls = kwargs.get("cfg_umls")
if cfg_umls:
dir_step1_umls = helper.create_folder(
Path(dir_step1).joinpath(
f"{cfg_hydra.custom.curation_settings.file_settings.directory_prefix}_Step-1_umls-api-search"
)
)
)
df_umls, cfg_umls = umls.run_umls_batch(
cfg_umls, df_dd=df_dd, dir_step1=dir_step1_umls
)
df_umls, cfg_umls = umls.run_umls_batch(
cfg_umls, df_dd=df_dd, dir_step1=dir_step1_umls
)
cat_dfs.append(df_umls)

## METAMAP API ##
dir_step1_mm = helper.create_folder(
Path(dir_step1).joinpath(
f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_metamap-search"
cfg_mm = kwargs.get("cfg_mm")
if cfg_mm:
dir_step1_mm = helper.create_folder(
Path(dir_step1).joinpath(
f"{cfg_hydra.custom.curation_settings.file_settings.directory_prefix}_Step-1_metamap-search"
)
)
)
df_metamap, cfg_mm = mm.run_mm_batch(cfg_mm, df_dd=df_dd, dir_step1=dir_step1_mm)
df_metamap, cfg_mm = mm.run_mm_batch(
cfg_mm, df_dd=df_dd, dir_step1=dir_step1_mm
)
cat_dfs.append(df_metamap)

## SEMANTIC SEARCH ##

dir_step1_ss = helper.create_folder(
Path(dir_step1).joinpath(
f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_hybrid-semantic-search_alpha={cfg_ss.semantic_search.query.alpha}"
cfg_ss = kwargs.get("cfg_ss")
if cfg_ss:
dir_step1_ss = helper.create_folder(
Path(dir_step1).joinpath(
f"{cfg_hydra.custom.curation_settings.file_settings.directory_prefix}_Step-1_hybrid-semantic-search_alpha={cfg_ss.semantic_search.query.alpha}"
)
)
)
df_semantic_search, cfg_ss = ss.run_hybrid_ss_batch(
cfg_ss, df_dd=df_dd, dir_step1=dir_step1_ss
)
df_semantic_search, cfg_ss = ss.run_hybrid_ss_batch(
cfg_ss, df_dd=df_dd, dir_step1=dir_step1_ss
)
cat_dfs.append(df_semantic_search)

## COMBINE RESULTS ##

df_results = pd.concat(
[df_umls, df_metamap, df_semantic_search], axis=0, ignore_index=True
)
df_results = pd.concat(cat_dfs, axis=0, ignore_index=True)
df_results.to_csv(Path(dir_step1).joinpath("hydra_search_results.csv"), index=False)

# FORMAT CURATION DATAFRAME
df_dd_preprocessed = proc_dd.process_data_dictionary(df_dd, cfg)
pipeline_name = f"hydra-search (custom={cfg.custom.settings.custom_config})"
df_dd_preprocessed = proc_dd.process_data_dictionary(df_dd, cfg_hydra)
pipeline_name = f"hydra-search (custom={cfg_hydra.custom.settings.custom_config})"
df_curation = cur.format_curation_dataframe(
df_dd, df_dd_preprocessed, pipeline_name, cfg
df_dd, df_dd_preprocessed, pipeline_name, cfg_hydra
)
curation_cols = list(cfg.custom.curation_settings.information_columns) + [
curation_cols = list(cfg_hydra.custom.curation_settings.information_columns) + [
"search_ID"
]
df_curation = df_curation[curation_cols]

## CREATE CURATION FILE ##
df_final = cur.create_curation_file(
dir_step1, df_dd, df_dd_preprocessed, df_curation, df_results, cfg
dir_step1, df_dd, df_dd_preprocessed, df_curation, df_results, cfg_hydra
)
helper.save_config(cfg, dir_step1)
helper.save_config(cfg_hydra, dir_step1)
print("FINISHED batch hydra search query pipeline!!!")

return df_final
return df_final, cfg_hydra


if __name__ == "__main__":
df_final = run_hydra_batch(cfg, cfg_umls, cfg_mm, cfg_ss)
df_final, cfg_hydra = run_hydra_batch(
cfg_hydra, cfg_umls=cfg_umls, cfg_mm=cfg_mm, cfg_ss=cfg_ss
) # TODO: maybe put module cfgs into a list
2 changes: 1 addition & 1 deletion ddcuimap/metamap/batch_query_pipeline.py
@@ -8,7 +8,7 @@
from prefect.task_runners import SequentialTaskRunner
from pathlib import Path
import ddcuimap.utils.helper as helper
import ddcuimap.utils.process_data_dictionary as proc_dd
import ddcuimap.curation.utils.process_data_dictionary as proc_dd

# MetaMap API
from ddcuimap.curation.utils import curation_functions as cur
@@ -13,7 +13,7 @@
from prefect.task_runners import SequentialTaskRunner

from ddcuimap.metamap.skr_web_api import Submission
from ddcuimap.utils.text_processing import (
from ddcuimap.curation.utils.text_processing import (
check_query_terms_valid,
unescape_string,
)