Skip to content

Commit

Permalink
V13.3.0 rc (#468)
Browse files Browse the repository at this point in the history
* Revise dashboards (#433)

* Don't cache local install (#431)

* Don't put cache dir

* Remove -e

* Update codeowners to use genie reviewers

* Remove missing vital status and assay info sections

* hotfix dockerfile

* pip3 install no longer works in dockerfile for locally built packages

* case-insensitive comparison

* test case-insensitive comparison

* black on genie_registry/assay.py only

* black on genie/process_functions.py only

* rename new test function

* replace underscores with hyphens

* add test for underscore versus hypen

* black on genie_registry/assay.py

* Modify tsa1, tsa2, ref maf error message (#438)

* Modify tsa1, tsa2, ref maf error message

* Fix tests

* Add sample class filter (#441)

* Add sample class filter

* lint

* Lint

* only filter for public release

* lint

* Make sure processing pipeline doesn't fail for older releases that don't have SAMPLE_CLASS

* Fix Docker build (#445)

* Change docker tag and add depedency

* comment in sdist

* Update pandas version (#446)

* Use iloc

* Use pd.concat

* Use pd.concat

* Use pd.concat

* Use pd.concat instead of append

* Update genie/database_to_staging.py

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* use pd.concat

* Use pd.concat

* lint

* exclude tests

* Use pd.concat

* Use mask to replace values

* Lint

* append

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* Add code of conduct (#448)

* year or int of death is not applicable for living patients (#450)

* year or int of death is not applicable for living patients

* update tests for dead variable

Co-authored-by: Thomas Yu <[email protected]>

* support scheduled job secrets (#453)

* Add documentation

* Uncommit

* Add in clinicalreported (#459)

* [ETL-156, ETL-157, ETL-158] Create configuration dictionary for GENIE code (#455)

* Add genie_config parameter

* Remove unused variables

* Remove unused parameters

* Remove parameters

* Edit doc

* Comment

* Add genie config to file format classes

* Comment

* Fix some tests

* Fix test

* Fix tests

* Add test configuration

* Fix tests

* Lint

* Update bin/input_to_database.py

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* [ETL-159] Use GENIE config dict for processing (#460)

* Remove dataBaseSynIdmapping

* Remove the need for database mapping df

* Remove need for database mapping df

* Remove use of database mapping df

* Fill in docs

* Added quick value error check

* Update hack text

* Fix tests and lint

* use genie config and utilize existing function

* Remove unused function

* Remove todos

* Fix

* [ETL-156] Use genie config in validation cli (#461)

* Update validation cli code to use genie config

* Fix tests

* Add in genie config

* Update genie/validate.py

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* Fix test

Co-authored-by: Haley Hunter-Zinck <[email protected]>

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* allow for unknown interval but known year (#464)

* Update version

* [IBCDPE-178] Create ValidationResult class  (#454)

* Create validation cls to prepare for detailed sample based errors to be returned

* Remove function

* Lint

* Fix tests

* Add tests

* is_valid() should be a function

* Add type hint

Co-authored-by: Haley Hunter-Zinck <[email protected]>
Co-authored-by: Haley Hunter-Zinck <[email protected]>
  • Loading branch information
3 people authored May 4, 2022
1 parent 9f61bb0 commit bb38bba
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 75 deletions.
2 changes: 1 addition & 1 deletion genie/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "13.2.0"
__version__ = "13.3.0"
60 changes: 51 additions & 9 deletions genie/example_filetype_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,49 @@
logger = logging.getLogger(__name__)


class ValidationResults:
"""Validation results"""

def __init__(self, errors: str, warnings: str, detailed: str = None) -> None:
"""
Args:
errors (str): errors for the file
warnings (str): warning for the file
detailed_errors (pd.DataFrame, optional): Dataframe of detailed row based
error messages. Defaults to None.
"""
self.errors = errors
self.warnings = warnings
self.detailed = detailed

def is_valid(self) -> bool:
"""True if file is valid"""
return self.errors == ""

def collect_errors_and_warnings(self) -> str:
"""Aggregates error and warnings into a string.
Returns:
str: A message containing errors and warnings.
"""
# Complete error message
message = "----------------ERRORS----------------\n"
if self.errors == "":
message = "YOUR FILE IS VALIDATED!\n"
logger.info(message)
else:
for error in self.errors.split("\n"):
if error != "":
logger.error(error)
message += self.errors
if self.warnings != "":
for warning in self.warnings.split("\n"):
if warning != "":
logger.warning(warning)
message += "-------------WARNINGS-------------\n" + self.warnings
return message


class FileTypeFormat(object):

_process_kwargs = ["newPath", "databaseSynId"]
Expand Down Expand Up @@ -126,13 +169,13 @@ def process(self, filePath, **kwargs):
path = self.process_steps(path_or_df, **mykwargs)
return path

def _validate(self, df, **kwargs):
def _validate(self, df: pd.DataFrame, **kwargs) -> tuple:
"""
This is the base validation function.
By default, no validation occurs.
Args:
df: A dataframe of the file
df (pd.DataFrame): A dataframe of the file
kwargs: The kwargs are determined by self._validation_kwargs
Returns:
Expand All @@ -142,9 +185,9 @@ def _validate(self, df, **kwargs):
errors = ""
warnings = ""
logger.info("NO VALIDATION for %s files" % self._fileType)
return (errors, warnings)
return errors, warnings

def validate(self, filePathList, **kwargs):
def validate(self, filePathList, **kwargs) -> ValidationResults:
"""
This is the main validation function.
Every file type calls self._validate, which is different.
Expand All @@ -168,15 +211,14 @@ def validate(self, filePathList, **kwargs):
try:
df = self.read_file(filePathList)
except Exception as e:
errors = "The file(s) ({filePathList}) cannot be read. Original error: {exception}".format(
filePathList=filePathList, exception=str(e)
errors = (
f"The file(s) ({filePathList}) cannot be read. Original error: {str(e)}"
)
warnings = ""

if not errors:
logger.info("VALIDATING %s" % os.path.basename(",".join(filePathList)))
errors, warnings = self._validate(df, **mykwargs)

valid = errors == ""

return valid, errors, warnings
result_cls = ValidationResults(errors=errors, warnings=warnings)
return result_cls
5 changes: 3 additions & 2 deletions genie/input_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,13 @@ def validatefile(
)
filetype = validator.file_type
if check_file_status["to_validate"]:
valid, message = validator.validate_single_file(
valid_cls, message = validator.validate_single_file(
oncotree_link=genie_config["oncotreeLink"], nosymbol_check=False
)

logger.info("VALIDATION COMPLETE")
input_status_list, invalid_errors_list = _get_status_and_error_list(
valid, message, entities
valid_cls.is_valid(), message, entities
)
# Send email the first time the file is invalid
if invalid_errors_list:
Expand Down
50 changes: 19 additions & 31 deletions genie/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import synapseclient
from synapseclient.core.exceptions import SynapseHTTPError

from . import config, process_functions
from . import config, example_filetype_format, process_functions

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -80,9 +80,10 @@ def validate_single_file(self, **kwargs):
"""

if self.file_type not in self._format_registry:
valid = False
errors = "Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally"
warnings = ""
valid_result_cls = example_filetype_format.ValidationResults(
errors="Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally",
warnings="",
)
else:
mykwargs = {}
for required_parameter in self._validate_kwargs:
Expand All @@ -99,14 +100,11 @@ def validate_single_file(self, **kwargs):
genie_config=self.genie_config,
)
filepathlist = [entity.path for entity in self.entitylist]
valid, errors, warnings = validator.validate(
filePathList=filepathlist, **mykwargs
)
valid_result_cls = validator.validate(filePathList=filepathlist, **mykwargs)

# Complete error message
message = collect_errors_and_warnings(errors, warnings)

return (valid, message)
message = valid_result_cls.collect_errors_and_warnings()
return (valid_result_cls, message)


# TODO: Remove this at some point
Expand All @@ -116,32 +114,22 @@ class GenieValidationHelper(ValidationHelper):
_validate_kwargs = ["nosymbol_check"]


def collect_errors_and_warnings(errors, warnings):
"""Aggregates error and warnings into a string.
def get_config(syn, synid):
"""Gets Synapse database to Table mapping in dict
Args:
errors: string of file errors, separated by new lines.
warnings: string of file warnings, separated by new lines.
syn: Synapse connection
synid: Synapse id of database mapping table
Returns:
message - errors + warnings
dict: {'databasename': 'synid'}
"""
# Complete error message
message = "----------------ERRORS----------------\n"
if errors == "":
message = "YOUR FILE IS VALIDATED!\n"
logger.info(message)
else:
for error in errors.split("\n"):
if error != "":
logger.error(error)
message += errors
if warnings != "":
for warning in warnings.split("\n"):
if warning != "":
logger.warning(warning)
message += "-------------WARNINGS-------------\n" + warnings
return message
config = syn.tableQuery("SELECT * FROM {}".format(synid))
configdf = config.asDataFrame()
configdf.index = configdf["Database"]
config_dict = configdf.to_dict()
return config_dict["Id"]


def _check_parentid_permission_container(syn, parentid):
Expand Down
10 changes: 8 additions & 2 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,14 @@ def _check_int_year_consistency(
for str_val in string_vals:
# n string values per row
n_str = (clinicaldf[cols] == str_val).sum(axis=1)
if n_str.between(0, len(cols), inclusive="neither").any():
is_text_inconsistent = True
# year can be known with unknown interval value
# otherwise must be all numeric or the same text value
if str_val == "Unknown":
if ((n_str == 1) & (clinicaldf[interval_col] != "Unknown")).any():
is_text_inconsistent = True
else:
if n_str.between(0, len(cols), inclusive="neither").any():
is_text_inconsistent = True

is_redaction_inconsistent = False
# Check that the redacted values are consistent
Expand Down
19 changes: 13 additions & 6 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,13 +727,13 @@ def test_remap_clinical_values(col):
def test__check_int_year_consistency_valid():
"""Test valid vital status consistency"""
testdf = pd.DataFrame(
{"INT_2": [1, 2, "Unknown"],
"YEAR_1": [1, 4, "Unknown"],
"FOO_3": [1, 3, "Unknown"]}
{"INT_2": [1, 2, "Unknown", "Unknown"],
"YEAR_1": [1, 4, "Unknown", 1],
"FOO_3": [1, 3, "Unknown", 1]}
)
error = genie_registry.clinical._check_int_year_consistency(
clinicaldf=testdf,
cols=['INT_2', "YEAR_1"],
cols=["INT_2", "YEAR_1"],
string_vals=["Unknown"]
)
assert error == ""
Expand All @@ -759,7 +759,7 @@ def test__check_int_year_consistency_valid():
(
pd.DataFrame(
{"INT_2": [1, "Unknown", "Unknown"],
"YEAR_1": [1, 4, "Unknown"]}
"YEAR_1": [1, "Not Applicable", "Unknown"]}
),
"Patient: you have inconsistent text values in INT_2, YEAR_1.\n"
),
Expand All @@ -780,10 +780,17 @@ def test__check_int_year_consistency_valid():
(
pd.DataFrame(
{"INT_2": ["<6570", "Unknown", "Unknown"],
"YEAR_1": [1, 3, "Unknown"]}
"YEAR_1": [1, "Not Applicable", "Unknown"]}
),
"Patient: you have inconsistent redaction and text values in "
"INT_2, YEAR_1.\n"
),
(
pd.DataFrame(
{"INT_2": ["12345", "Unknown", "Unknown"],
"YEAR_1": ["Unknown", "Unknown", "Unknown"]}
),
"Patient: you have inconsistent text values in INT_2, YEAR_1.\n"
)
]
)
Expand Down
18 changes: 11 additions & 7 deletions tests/test_input_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
import synapseclient
import synapseutils

from genie import input_to_database, process_functions, process_mutation
from genie import (
input_to_database, example_filetype_format, process_functions, process_mutation
)
from genie_registry.clinical import Clinical
from genie.validate import GenieValidationHelper

Expand Down Expand Up @@ -356,7 +358,7 @@ def test_valid_validatefile(genie_config):
entity.modifiedBy = '333'
entity.createdBy = '444'
entities = [entity]
valid = True
valid_results = example_filetype_format.ValidationResults(errors='', warnings='')
message = "Is valid"
filetype = "clinical"
# Only a list is returned as opposed a list of lists when there are
Expand All @@ -373,7 +375,7 @@ def test_valid_validatefile(genie_config):
'error_list': [],
'to_validate': True}) as patch_check, \
patch.object(GenieValidationHelper, "validate_single_file",
return_value=(valid, message)) as patch_validate,\
return_value=(valid_results, message)) as patch_validate,\
patch.object(input_to_database, "_get_status_and_error_list",
return_value=status_error_list_results) as patch_staterror_list,\
patch.object(input_to_database,
Expand All @@ -391,7 +393,7 @@ def test_valid_validatefile(genie_config):
validation_statusdf, error_trackerdf, entities)
patch_determine_filetype.assert_called_once()
patch_staterror_list.assert_called_once_with(
valid, message, entities)
valid_results.is_valid(), message, entities)
patch_send_email.assert_not_called()


Expand All @@ -410,7 +412,9 @@ def test_invalid_validatefile(genie_config):
entity.modifiedBy = '333'
entity.createdBy = '444'
entities = [entity]
valid = False
valid_results = example_filetype_format.ValidationResults(
errors='Is invalid', warnings=''
)
message = "Is invalid"
filetype = "clinical"
status_error_list_results = ([{'entity': entity, 'status': 'INVALID'}],
Expand Down Expand Up @@ -444,7 +448,7 @@ def test_invalid_validatefile(genie_config):
'error_list': [],
'to_validate': True}) as patch_check, \
patch.object(GenieValidationHelper, "validate_single_file",
return_value=(valid, message)) as patch_validate,\
return_value=(valid_results, message)) as patch_validate,\
patch.object(input_to_database, "_get_status_and_error_list",
return_value=status_error_list_results) as patch_staterror_list:

Expand All @@ -460,7 +464,7 @@ def test_invalid_validatefile(genie_config):
validation_statusdf, error_trackerdf, entities)
patch_determine_filetype.assert_called_once()
patch_staterror_list.assert_called_once_with(
valid, message, entities)
valid_results.is_valid(), message, entities)


def test_already_validated_validatefile():
Expand Down
Loading

0 comments on commit bb38bba

Please sign in to comment.