Skip to content

Commit

Permalink
Merge pull request #33 from Sage-Bionetworks/gh-29-validate-mapping
Browse files Browse the repository at this point in the history
Validation script for BPC to cBioPortal mapping file
  • Loading branch information
hhunterzinck authored Jan 29, 2022
2 parents b3f2b39 + d6fc134 commit ea33ed8
Show file tree
Hide file tree
Showing 3 changed files with 300 additions and 0 deletions.
50 changes: 50 additions & 0 deletions scripts/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
dataset:
'Cancer-level dataset':
id: syn22296816
file: ca_dx_derived.csv
'Patient-level dataset':
id: syn22296817
file: pt_derived.csv
'Regimen-Cancer level dataset':
id: syn22296818
file: ca_drugs_derived.csv
'Imaging-level dataset':
id: syn22296819
file: prissmm_image_derived.csv
'Pathology-report level dataset':
id: syn22296820
file: prissmm_path_derived.csv
'Med Onc Note level dataset':
id: syn22296822
file: prissmm_md_derived.csv
'Cancer panel test level dataset':
id: syn22296823
file: cpt_derived.csv
'Cancer-level index dataset':
id: syn22314486
file: ca_dx_derived_index.csv
'Cancer-level non-index dataset':
id: syn22314497
file: ca_dx_derived_non_index.csv
'Hemeonc dataset':
id: syn23561688
file: hemonc_mapping_cbio.csv
'PRISSMM Tumor Marker level dataset':
id: syn23561700
file: prissmm_tm_derived.csv
'Cancer-Directed Radiation Therapy dataset':
id: syn25931923
file: ca_radtx_derived.csv
check:
1:
function: check_code_name_empty
implemented: 1
deprecated: 0
description: Code is empty.
request: Please remove the row or fill in the code name.
2:
function: check_code_name_absent
implemented: 1
deprecated: 0
description: Code does not exist in associated dataset.
request: Please check the code name and associated dataset.
18 changes: 18 additions & 0 deletions scripts/test_validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Test validate map"""
import yaml

from validate_map import *


def test__function_map():
    """Test that all functions referenced in the config file are listed in the function map."""
    config = read_config("config.yaml")
    fxn_map = create_function_map()

    # Collect every function name referenced by a configured check.
    fxn_config = [config["check"][check]["function"] for check in config["check"]]

    # Any name present in the config but absent from the map is an error.
    missing = set(fxn_config) - set(fxn_map)
    assert not missing
232 changes: 232 additions & 0 deletions scripts/validate_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
"""
Description: Validate the BPC to cBioPortal mapping file.
Author: Haley Hunter-Zinck
Date: 2022-01-27
"""

import argparse
import logging
import re

import pandas as pd
import synapseclient
from synapseclient import Synapse
from synapseclient.core.exceptions import (
SynapseAuthenticationError,
SynapseNoCredentialsError,
)
import yaml


def check_code_name_empty(df: pd.DataFrame, syn: "Synapse", config: dict) -> list:
    """Check for any code that is empty.

    Args:
        df: dataframe representing the BPC to cBioPortal map
        syn: Synapse object (unused; kept for a uniform check signature)
        config: configuration parameters (unused; kept for a uniform check signature)

    Returns:
        list of values from the "code" column that are missing (NaN),
        one entry per empty row.
    """
    # Single .loc with a row mask and a column label avoids chained
    # indexing (df.loc[mask]["code"]), which pandas warns against.
    return list(df.loc[df["code"].isna(), "code"])


def check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> list:
    """Check for any code name that does not appear as a column in its
    associated data file.

    Args:
        df: dataframe representing the BPC to cBioPortal map
        syn: Synapse object used to download each dataset
        config: configuration parameters holding dataset Synapse IDs

    Returns:
        list of code names missing from their associated datasets.
    """
    absent = []
    for dataset, info in config["dataset"].items():
        data = pd.read_csv(syn.get(info["id"])["path"], low_memory=False)
        code_data = set(data.columns)

        # get codes associated with the dataset and of types derived or curated
        # (compute the lowercased type column once per dataset)
        data_type = df["data_type"].str.lower()
        code_map = list(
            df.loc[
                (df["dataset"] == dataset)
                & ((data_type == "derived") | (data_type == "curated"))
            ]["code"]
        )

        # do not check wildcard code names (trailing '*') or NA code names;
        # filtering into a new list avoids the O(n^2) list.remove() pattern,
        # which also misbehaves with NaN values
        codes = [
            code
            for code in code_map
            if not pd.isna(code) and not re.match(r"^.+[*]$", str(code).strip())
        ]

        absent.extend(set(codes) - code_data)
    return absent


def format_result(codes: list, config: dict, check_no: int) -> pd.DataFrame:
    """Format check output for an interpretable log file.

    Args:
        codes: code names flagged by the check
        config: configuration parameters holding check metadata
        check_no: check number for which to format results

    Returns:
        dataframe with one row per flagged code, annotated with the
        check number, description, and requested action.
    """
    check_info = config["check"][check_no]
    formatted = pd.DataFrame()
    formatted["code"] = codes
    # Scalar assignments broadcast to every row (and leave an empty frame empty).
    formatted["check_no"] = str(check_no)
    formatted["description"] = check_info["description"]
    formatted["action"] = check_info["request"]
    return formatted


def create_function_map() -> dict:
    """Map each check function's name to its implementation."""
    return {
        fxn.__name__: fxn
        for fxn in (check_code_name_empty, check_code_name_absent)
    }


def validate_map(
    synapse_id: str, syn: Synapse, config: dict, version: int
) -> pd.DataFrame:
    """Run all implemented checks on mapping file.

    Args:
        synapse_id: Synapse ID of mapping file
        syn: Synapse object
        config: configuration parameters
        version: version number of the Synapse entity, or the string
            "None" (the argparse default) for the current version

    Returns:
        dataframe with one row per detected error plus check metadata,
        numbered in an "issue" column.
    """
    fxns = create_function_map()

    # "None" is the string sentinel argparse supplies for "current version".
    if version == "None":
        df = pd.read_csv(syn.get(synapse_id)["path"])
    else:
        df = pd.read_csv(syn.get(synapse_id, version=version)["path"])

    results = []
    for check_no in config["check"]:

        logging.info(f"Check {check_no}...")

        check = config["check"][check_no]
        if check["implemented"] and not check["deprecated"]:
            result = fxns[check["function"]](df, syn, config)
            formatted = format_result(result, config, check_no)
            results.append(formatted)
            # Report the count for this check, not the running total.
            logging.info(f" Found {formatted.shape[0]} error(s).")
        else:
            logging.info(" Check deprecated or not implemented.")

    # DataFrame.append was removed in pandas 2.0; concatenate instead.
    errors = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
    errors.insert(0, "issue", range(1, errors.shape[0] + 1, 1))

    return errors


def build_parser():
    """Construct the command-line argument parser for the validator."""
    parser = argparse.ArgumentParser(
        description="Checks validity of BPC to cBioPortal mapping file "
    )
    # Required positional: the Synapse entity holding the mapping file.
    parser.add_argument(
        "synapse_id",
        metavar="SYNAPSE_ID",
        type=str,
        help="Synapse ID of mapping file",
    )
    # Optional flags, all with string defaults so downstream comparisons
    # (e.g. version == "None") behave consistently.
    parser.add_argument(
        "--version",
        "-v",
        metavar="VERSION",
        type=str,
        default="None",
        help="Synapse entity version number (default: current)",
    )
    parser.add_argument(
        "--outfile",
        "-o",
        metavar="OUTFILE",
        type=str,
        default="output.csv",
        help="Name of output file (default: %(default)s)",
    )
    parser.add_argument(
        "--log",
        "-l",
        type=str,
        choices=["debug", "info", "warning", "error"],
        default="error",
        help="Set logging output level (default: %(default)s)",
    )
    return parser


def read_config(file: str) -> dict:
    """Load configuration parameters from a YAML file.

    Args:
        file: path to the YAML configuration file

    Returns:
        parsed configuration as a dict, or None if parsing failed
        (the YAML error is printed to stdout).
    """
    with open(file, "r") as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            return None


def synapse_login(synapse_config=synapseclient.client.CONFIG_FILE):
    """Login to Synapse.

    Args:
        synapse_config: Path to synapse configuration file.
            Defaults to ~/.synapseConfig

    Returns:
        Synapse connection

    Raises:
        ValueError: if credentials are missing or authentication fails.
    """
    try:
        syn = synapseclient.Synapse(skip_checks=True, configPath=synapse_config)
        syn.login(silent=True)
        return syn
    except (SynapseAuthenticationError, SynapseNoCredentialsError):
        raise ValueError(
            "Login error: please make sure you have correctly "
            "configured your client. Instructions here: "
            "https://help.synapse.org/docs/Client-Configuration.1985446156.html. "
            "You can also create a Synapse Personal Access Token and set it "
            "as an environmental variable: "
            "SYNAPSE_AUTH_TOKEN='<my_personal_access_token>'"
        )


def main():
    """Parse arguments, configure logging, validate the map, write results."""

    args = build_parser().parse_args()
    config = read_config("config.yaml")
    syn = synapse_login()

    # Translate the --log choice (e.g. "info") into a logging constant.
    log_level = getattr(logging, args.log.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid log level: %s" % args.log)
    logging.basicConfig(level=log_level)

    result = validate_map(args.synapse_id, syn, config, args.version)
    result.to_csv(args.outfile, index=False)

    logging.info(f"Output written to '{args.outfile}'")


# Script entry point: run the validator when executed directly.
if __name__ == "__main__":
    main()

0 comments on commit ea33ed8

Please sign in to comment.