Skip to content

Commit

Permalink
Merge pull request #33 from Sage-Bionetworks/gh-29-validate-mapping
Browse files Browse the repository at this point in the history
Validation script for BPC to cBioPortal mapping file
  • Loading branch information
hhunterzinck authored Jan 29, 2022
2 parents b3f2b39 + d6fc134 commit ea33ed8
Show file tree
Hide file tree
Showing 3 changed files with 300 additions and 0 deletions.
50 changes: 50 additions & 0 deletions scripts/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
dataset:
'Cancer-level dataset':
id: syn22296816
file: ca_dx_derived.csv
'Patient-level dataset':
id: syn22296817
file: pt_derived.csv
'Regimen-Cancer level dataset':
id: syn22296818
file: ca_drugs_derived.csv
'Imaging-level dataset':
id: syn22296819
file: prissmm_image_derived.csv
'Pathology-report level dataset':
id: syn22296820
file: prissmm_path_derived.csv
'Med Onc Note level dataset':
id: syn22296822
file: prissmm_md_derived.csv
'Cancer panel test level dataset':
id: syn22296823
file: cpt_derived.csv
'Cancer-level index dataset':
id: syn22314486
file: ca_dx_derived_index.csv
'Cancer-level non-index dataset':
id: syn22314497
file: ca_dx_derived_non_index.csv
'Hemeonc dataset':
id: syn23561688
file: hemonc_mapping_cbio.csv
'PRISSMM Tumor Marker level dataset':
id: syn23561700
file: prissmm_tm_derived.csv
'Cancer-Directed Radiation Therapy dataset':
id: syn25931923
file: ca_radtx_derived.csv
check:
1:
function: check_code_name_empty
implemented: 1
deprecated: 0
description: Code is empty.
request: Please remove the row or fill in the code name.
2:
function: check_code_name_absent
implemented: 1
deprecated: 0
description: Code does not exist in associated dataset.
request: Please check the code name and associated dataset.
18 changes: 18 additions & 0 deletions scripts/test_validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Test validate map"""
import yaml

from validate_map import *


def test__function_map():
    """Test that all functions referenced in the config file are listed in the function map."""
    config = read_config("config.yaml")
    fxn_map = create_function_map()

    # Collect every function name referenced by a configured check.
    fxn_config = [config["check"][check]["function"] for check in config["check"]]

    # Any name present in the config but absent from the map is an error.
    missing = set(fxn_config) - set(fxn_map)
    assert not missing
232 changes: 232 additions & 0 deletions scripts/validate_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
"""
Description: Validate the BPC to cBioPortal mapping file.
Author: Haley Hunter-Zinck
Date: 2022-01-27
"""

import argparse
import logging
import re

import pandas as pd
import synapseclient
from synapseclient import Synapse
from synapseclient.core.exceptions import (
SynapseAuthenticationError,
SynapseNoCredentialsError,
)
import yaml


def check_code_name_empty(df: pd.DataFrame, syn: "Synapse", config: dict) -> list:
    """Check for any code that is empty.

    Args:
        df: dataframe representing the BPC to cBioPortal map
        syn: Synapse object (unused; kept for a uniform check signature)
        config: configuration parameters (unused; kept for a uniform check signature)

    Returns:
        list of values from the "code" column that are missing (NaN),
        one entry per empty row.
    """
    # Single .loc with a row mask and a column label avoids chained
    # indexing (df.loc[mask]["code"]), which pandas warns against.
    return list(df.loc[df["code"].isna(), "code"])


def check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> list:
    """Check for any code name that does not appear as a column in its
    associated data file.

    Args:
        df: dataframe representing the BPC to cBioPortal map
        syn: Synapse object used to download each dataset
        config: configuration parameters holding dataset Synapse IDs

    Returns:
        list of code names missing from their associated datasets.
    """
    absent = []
    for dataset, info in config["dataset"].items():
        data = pd.read_csv(syn.get(info["id"])["path"], low_memory=False)
        code_data = set(data.columns)

        # get codes associated with the dataset and of types derived or curated
        # (compute the lowercased type column once per dataset)
        data_type = df["data_type"].str.lower()
        code_map = list(
            df.loc[
                (df["dataset"] == dataset)
                & ((data_type == "derived") | (data_type == "curated"))
            ]["code"]
        )

        # do not check wildcard code names (trailing '*') or NA code names;
        # filtering into a new list avoids the O(n^2) list.remove() pattern,
        # which also misbehaves with NaN values
        codes = [
            code
            for code in code_map
            if not pd.isna(code) and not re.match(r"^.+[*]$", str(code).strip())
        ]

        absent.extend(set(codes) - code_data)
    return absent


def format_result(codes: list, config: dict, check_no: int) -> pd.DataFrame:
    """Format check output for an interpretable log file.

    Args:
        codes: code names flagged by the check
        config: configuration parameters holding check metadata
        check_no: check number for which to format results

    Returns:
        dataframe with one row per flagged code, annotated with the
        check number, description, and requested action.
    """
    check_info = config["check"][check_no]
    formatted = pd.DataFrame()
    formatted["code"] = codes
    # Scalar assignments broadcast to every row (and leave an empty frame empty).
    formatted["check_no"] = str(check_no)
    formatted["description"] = check_info["description"]
    formatted["action"] = check_info["request"]
    return formatted


def create_function_map() -> dict:
    """Map each check function's name to its implementation."""
    return {
        fxn.__name__: fxn
        for fxn in (check_code_name_empty, check_code_name_absent)
    }


def validate_map(
    synapse_id: str, syn: Synapse, config: dict, version: int
) -> pd.DataFrame:
    """Run all implemented checks on mapping file.

    Args:
        synapse_id: Synapse ID of mapping file
        syn: Synapse object
        config: configuration parameters
        version: version number of the Synapse entity, or the string
            "None" (the argparse default) for the current version

    Returns:
        dataframe with one row per detected error plus check metadata,
        numbered in an "issue" column.
    """
    fxns = create_function_map()

    # "None" is the string sentinel argparse supplies for "current version".
    if version == "None":
        df = pd.read_csv(syn.get(synapse_id)["path"])
    else:
        df = pd.read_csv(syn.get(synapse_id, version=version)["path"])

    results = []
    for check_no in config["check"]:

        logging.info(f"Check {check_no}...")

        check = config["check"][check_no]
        if check["implemented"] and not check["deprecated"]:
            result = fxns[check["function"]](df, syn, config)
            formatted = format_result(result, config, check_no)
            results.append(formatted)
            # Report the count for this check, not the running total.
            logging.info(f" Found {formatted.shape[0]} error(s).")
        else:
            logging.info(" Check deprecated or not implemented.")

    # DataFrame.append was removed in pandas 2.0; concatenate instead.
    errors = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
    errors.insert(0, "issue", range(1, errors.shape[0] + 1, 1))

    return errors


def build_parser():
    """Construct the command-line argument parser for the validator."""
    parser = argparse.ArgumentParser(
        description="Checks validity of BPC to cBioPortal mapping file "
    )
    # Required positional: the Synapse entity holding the mapping file.
    parser.add_argument(
        "synapse_id",
        metavar="SYNAPSE_ID",
        type=str,
        help="Synapse ID of mapping file",
    )
    # Optional flags, all with string defaults so downstream comparisons
    # (e.g. version == "None") behave consistently.
    parser.add_argument(
        "--version",
        "-v",
        metavar="VERSION",
        type=str,
        default="None",
        help="Synapse entity version number (default: current)",
    )
    parser.add_argument(
        "--outfile",
        "-o",
        metavar="OUTFILE",
        type=str,
        default="output.csv",
        help="Name of output file (default: %(default)s)",
    )
    parser.add_argument(
        "--log",
        "-l",
        type=str,
        choices=["debug", "info", "warning", "error"],
        default="error",
        help="Set logging output level (default: %(default)s)",
    )
    return parser


def read_config(file: str) -> dict:
    """Load configuration parameters from a YAML file.

    Args:
        file: path to the YAML configuration file

    Returns:
        parsed configuration as a dict, or None if parsing failed
        (the YAML error is printed to stdout).
    """
    with open(file, "r") as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            return None


def synapse_login(synapse_config=synapseclient.client.CONFIG_FILE):
    """Login to Synapse.

    Args:
        synapse_config: Path to synapse configuration file.
            Defaults to ~/.synapseConfig

    Returns:
        Synapse connection

    Raises:
        ValueError: if credentials are missing or authentication fails.
    """
    try:
        syn = synapseclient.Synapse(skip_checks=True, configPath=synapse_config)
        syn.login(silent=True)
        return syn
    except (SynapseAuthenticationError, SynapseNoCredentialsError):
        raise ValueError(
            "Login error: please make sure you have correctly "
            "configured your client. Instructions here: "
            "https://help.synapse.org/docs/Client-Configuration.1985446156.html. "
            "You can also create a Synapse Personal Access Token and set it "
            "as an environmental variable: "
            "SYNAPSE_AUTH_TOKEN='<my_personal_access_token>'"
        )


def main():
    """Parse arguments, configure logging, validate the map, write results."""

    args = build_parser().parse_args()
    config = read_config("config.yaml")
    syn = synapse_login()

    # Translate the --log choice (e.g. "info") into a logging constant.
    log_level = getattr(logging, args.log.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid log level: %s" % args.log)
    logging.basicConfig(level=log_level)

    result = validate_map(args.synapse_id, syn, config, args.version)
    result.to_csv(args.outfile, index=False)

    logging.info(f"Output written to '{args.outfile}'")


# Script entry point: run the validator when executed directly.
if __name__ == "__main__":
    main()

0 comments on commit ea33ed8

Please sign in to comment.