V13.3.0 rc (#468)

* Revise dashboards (#433) * Don't cache local install (#431) * Don't put cache dir * Remove -e * Update codeowners to use genie reviewers * Remove missing vital status and assay info sections * hotfix dockerfile * pip3 install no longer works in dockerfile for locally built packages * case-insensitive comparison * test case-insensitive comparison * black on genie_registry/assay.py only * black on genie/process_functions.py only * rename new test function * replace underscores with hyphens * add test for underscore versus hypen * black on genie_registry/assay.py * Modify tsa1, tsa2, ref maf error message (#438) * Modify tsa1, tsa2, ref maf error message * Fix tests * Add sample class filter (#441) * Add sample class filter * lint * Lint * only filter for public release * lint * Make sure processing pipeline doesn't fail for older releases that don't have SAMPLE_CLASS * Fix Docker build (#445) * Change docker tag and add depedency * comment in sdist * Update pandas version (#446) * Use iloc * Use pd.concat * Use pd.concat * Use pd.concat * Use pd.concat instead of append * Update genie/database_to_staging.py Co-authored-by: Haley Hunter-Zinck <[email protected]> * use pd.concat * Use pd.concat * lint * exclude tests * Use pd.concat * Use mask to replace values * Lint * append Co-authored-by: Haley Hunter-Zinck <[email protected]> * Add code of conduct (#448) * year or int of death is not applicable for living patients (#450) * year or int of death is not applicable for living patients * update tests for dead variable Co-authored-by: Thomas Yu <[email protected]> * support scheduled job secrets (#453) * Add documentation * Uncommit * Add in clinicalreported (#459) * [ETL-156, ETL-157, ETL-158] Create configuration dictionary for GENIE code (#455) * Add genie_config parameter * Remove unused variables * Remove unused parameters * Remove parameters * Edit doc * Comment * Add genie config to file format classes * Comment * Fix some tests * Fix test * Fix tests * Add test configuration * Fix tests * Lint * Update bin/input_to_database.py Co-authored-by: Haley Hunter-Zinck <[email protected]> * [ETL-159] Use GENIE config dict for processing (#460) * Remove dataBaseSynIdmapping * Remove the need for database mapping df * Remove need for database mapping df * Remove use of database mapping df * Fill in docs * Added quick value error check * Update hack text * Fix tests and lint * use genie config and utilize existing function * Remove unused function * Remove todos * Fix * [ETL-156] Use genie config in validation cli (#461) * Update validation cli code to use genie config * Fix tests * Add in genie config * Update genie/validate.py Co-authored-by: Haley Hunter-Zinck <[email protected]> * Fix test Co-authored-by: Haley Hunter-Zinck <[email protected]> Co-authored-by: Haley Hunter-Zinck <[email protected]> * allow for unknown interval but known year (#464) * Update version * [IBCDPE-178] Create ValidationResult class (#454) * Create validation cls to prepare for detailed sample based errors to be returned * Remove function * Lint * Fix tests * Add tests * is_valid() should be a function * Add type hint Co-authored-by: Haley Hunter-Zinck <[email protected]> Co-authored-by: Haley Hunter-Zinck <[email protected]>
Sage-Bionetworks · May 4, 2022 · bb38bba · bb38bba
1 parent 9f61bb0
commit bb38bba
Show file tree

Hide file tree

Showing 8 changed files with 125 additions and 75 deletions.
diff --git a/genie/__version__.py b/genie/__version__.py
@@ -1 +1 @@
-__version__ = "13.2.0"
+__version__ = "13.3.0"
diff --git a/genie/example_filetype_format.py b/genie/example_filetype_format.py
@@ -6,6 +6,49 @@
 logger = logging.getLogger(__name__)
 
 
+class ValidationResults:
+    """Validation results"""
+
+    def __init__(self, errors: str, warnings: str, detailed: str = None) -> None:
+        """
+        Args:
+            errors (str): errors for the file
+            warnings (str): warning for the file
+            detailed_errors (pd.DataFrame, optional): Dataframe of detailed row based
+                                                      error messages. Defaults to None.
+        """
+        self.errors = errors
+        self.warnings = warnings
+        self.detailed = detailed
+
+    def is_valid(self) -> bool:
+        """True if file is valid"""
+        return self.errors == ""
+
+    def collect_errors_and_warnings(self) -> str:
+        """Aggregates error and warnings into a string.
+
+        Returns:
+            str: A message containing errors and warnings.
+        """
+        # Complete error message
+        message = "----------------ERRORS----------------\n"
+        if self.errors == "":
+            message = "YOUR FILE IS VALIDATED!\n"
+            logger.info(message)
+        else:
+            for error in self.errors.split("\n"):
+                if error != "":
+                    logger.error(error)
+            message += self.errors
+        if self.warnings != "":
+            for warning in self.warnings.split("\n"):
+                if warning != "":
+                    logger.warning(warning)
+            message += "-------------WARNINGS-------------\n" + self.warnings
+        return message
+
+
 class FileTypeFormat(object):
 
     _process_kwargs = ["newPath", "databaseSynId"]
@@ -126,13 +169,13 @@ def process(self, filePath, **kwargs):
         path = self.process_steps(path_or_df, **mykwargs)
         return path
 
-    def _validate(self, df, **kwargs):
+    def _validate(self, df: pd.DataFrame, **kwargs) -> tuple:
         """
         This is the base validation function.
         By default, no validation occurs.
 
         Args:
-            df: A dataframe of the file
+            df (pd.DataFrame): A dataframe of the file
             kwargs: The kwargs are determined by self._validation_kwargs
 
         Returns:
@@ -142,9 +185,9 @@ def _validate(self, df, **kwargs):
         errors = ""
         warnings = ""
         logger.info("NO VALIDATION for %s files" % self._fileType)
-        return (errors, warnings)
+        return errors, warnings
 
-    def validate(self, filePathList, **kwargs):
+    def validate(self, filePathList, **kwargs) -> ValidationResults:
         """
         This is the main validation function.
         Every file type calls self._validate, which is different.
@@ -168,15 +211,14 @@ def validate(self, filePathList, **kwargs):
         try:
             df = self.read_file(filePathList)
         except Exception as e:
-            errors = "The file(s) ({filePathList}) cannot be read. Original error: {exception}".format(
-                filePathList=filePathList, exception=str(e)
+            errors = (
+                f"The file(s) ({filePathList}) cannot be read. Original error: {str(e)}"
             )
             warnings = ""
 
         if not errors:
             logger.info("VALIDATING %s" % os.path.basename(",".join(filePathList)))
             errors, warnings = self._validate(df, **mykwargs)
 
-        valid = errors == ""
-
-        return valid, errors, warnings
+        result_cls = ValidationResults(errors=errors, warnings=warnings)
+        return result_cls
diff --git a/genie/input_to_database.py b/genie/input_to_database.py
@@ -283,12 +283,13 @@ def validatefile(
     )
     filetype = validator.file_type
     if check_file_status["to_validate"]:
-        valid, message = validator.validate_single_file(
+        valid_cls, message = validator.validate_single_file(
             oncotree_link=genie_config["oncotreeLink"], nosymbol_check=False
         )
+
         logger.info("VALIDATION COMPLETE")
         input_status_list, invalid_errors_list = _get_status_and_error_list(
-            valid, message, entities
+            valid_cls.is_valid(), message, entities
         )
         # Send email the first time the file is invalid
         if invalid_errors_list:

diff --git a/genie/validate.py b/genie/validate.py
@@ -4,7 +4,7 @@
 import synapseclient
 from synapseclient.core.exceptions import SynapseHTTPError
 
-from . import config, process_functions
+from . import config, example_filetype_format, process_functions
 
 logger = logging.getLogger(__name__)
 
@@ -80,9 +80,10 @@ def validate_single_file(self, **kwargs):
         """
 
         if self.file_type not in self._format_registry:
-            valid = False
-            errors = "Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally"
-            warnings = ""
+            valid_result_cls = example_filetype_format.ValidationResults(
+                errors="Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally",
+                warnings="",
+            )
         else:
             mykwargs = {}
             for required_parameter in self._validate_kwargs:
@@ -99,14 +100,11 @@ def validate_single_file(self, **kwargs):
                 genie_config=self.genie_config,
             )
             filepathlist = [entity.path for entity in self.entitylist]
-            valid, errors, warnings = validator.validate(
-                filePathList=filepathlist, **mykwargs
-            )
+            valid_result_cls = validator.validate(filePathList=filepathlist, **mykwargs)
 
         # Complete error message
-        message = collect_errors_and_warnings(errors, warnings)
-
-        return (valid, message)
+        message = valid_result_cls.collect_errors_and_warnings()
+        return (valid_result_cls, message)
 
 
 # TODO: Remove this at some point
@@ -116,32 +114,22 @@ class GenieValidationHelper(ValidationHelper):
     _validate_kwargs = ["nosymbol_check"]
 
 
-def collect_errors_and_warnings(errors, warnings):
-    """Aggregates error and warnings into a string.
+def get_config(syn, synid):
+    """Gets Synapse database to Table mapping in dict
 
     Args:
-        errors: string of file errors, separated by new lines.
-        warnings: string of file warnings, separated by new lines.
+        syn: Synapse connection
+        synid: Synapse id of database mapping table
 
     Returns:
-        message - errors + warnings
+        dict: {'databasename': 'synid'}
+
     """
-    # Complete error message
-    message = "----------------ERRORS----------------\n"
-    if errors == "":
-        message = "YOUR FILE IS VALIDATED!\n"
-        logger.info(message)
-    else:
-        for error in errors.split("\n"):
-            if error != "":
-                logger.error(error)
-        message += errors
-    if warnings != "":
-        for warning in warnings.split("\n"):
-            if warning != "":
-                logger.warning(warning)
-        message += "-------------WARNINGS-------------\n" + warnings
-    return message
+    config = syn.tableQuery("SELECT * FROM {}".format(synid))
+    configdf = config.asDataFrame()
+    configdf.index = configdf["Database"]
+    config_dict = configdf.to_dict()
+    return config_dict["Id"]
 
 
 def _check_parentid_permission_container(syn, parentid):

diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
@@ -140,8 +140,14 @@ def _check_int_year_consistency(
     for str_val in string_vals:
         # n string values per row
         n_str = (clinicaldf[cols] == str_val).sum(axis=1)
-        if n_str.between(0, len(cols), inclusive="neither").any():
-            is_text_inconsistent = True
+        # year can be known with unknown interval value
+        # otherwise must be all numeric or the same text value
+        if str_val == "Unknown":
+            if ((n_str == 1) & (clinicaldf[interval_col] != "Unknown")).any():
+                is_text_inconsistent = True
+        else:
+            if n_str.between(0, len(cols), inclusive="neither").any():
+                is_text_inconsistent = True
 
     is_redaction_inconsistent = False
     # Check that the redacted values are consistent

diff --git a/tests/test_clinical.py b/tests/test_clinical.py
@@ -727,13 +727,13 @@ def test_remap_clinical_values(col):
 def test__check_int_year_consistency_valid():
     """Test valid vital status consistency"""
     testdf = pd.DataFrame(
-        {"INT_2": [1, 2, "Unknown"],
-         "YEAR_1": [1, 4, "Unknown"],
-         "FOO_3": [1, 3, "Unknown"]}
+        {"INT_2": [1, 2, "Unknown", "Unknown"],
+         "YEAR_1": [1, 4, "Unknown", 1],
+         "FOO_3": [1, 3, "Unknown", 1]}
     )
     error = genie_registry.clinical._check_int_year_consistency(
         clinicaldf=testdf,
-        cols=['INT_2', "YEAR_1"],
+        cols=["INT_2", "YEAR_1"],
         string_vals=["Unknown"]
     )
     assert error == ""
@@ -759,7 +759,7 @@ def test__check_int_year_consistency_valid():
         (
             pd.DataFrame(
                 {"INT_2": [1, "Unknown", "Unknown"],
-                 "YEAR_1": [1, 4, "Unknown"]}
+                 "YEAR_1": [1, "Not Applicable", "Unknown"]}
             ),
             "Patient: you have inconsistent text values in INT_2, YEAR_1.\n"
         ),
@@ -780,10 +780,17 @@ def test__check_int_year_consistency_valid():
         (
             pd.DataFrame(
                 {"INT_2": ["<6570", "Unknown", "Unknown"],
-                 "YEAR_1": [1, 3, "Unknown"]}
+                 "YEAR_1": [1, "Not Applicable", "Unknown"]}
             ),
             "Patient: you have inconsistent redaction and text values in "
             "INT_2, YEAR_1.\n"
+        ),
+        (
+            pd.DataFrame(
+                {"INT_2": ["12345", "Unknown", "Unknown"],
+                 "YEAR_1": ["Unknown", "Unknown", "Unknown"]}
+            ),
+            "Patient: you have inconsistent text values in INT_2, YEAR_1.\n"
         )
     ]
 )

diff --git a/tests/test_input_to_database.py b/tests/test_input_to_database.py
@@ -7,7 +7,9 @@
 import synapseclient
 import synapseutils
 
-from genie import input_to_database, process_functions, process_mutation
+from genie import (
+    input_to_database, example_filetype_format, process_functions, process_mutation
+)
 from genie_registry.clinical import Clinical
 from genie.validate import GenieValidationHelper
 
@@ -356,7 +358,7 @@ def test_valid_validatefile(genie_config):
     entity.modifiedBy = '333'
     entity.createdBy = '444'
     entities = [entity]
-    valid = True
+    valid_results = example_filetype_format.ValidationResults(errors='', warnings='')
     message = "Is valid"
     filetype = "clinical"
     # Only a list is returned as opposed a list of lists when there are
@@ -373,7 +375,7 @@ def test_valid_validatefile(genie_config):
                                     'error_list': [],
                                     'to_validate': True}) as patch_check, \
          patch.object(GenieValidationHelper, "validate_single_file",
-                      return_value=(valid, message)) as patch_validate,\
+                      return_value=(valid_results, message)) as patch_validate,\
          patch.object(input_to_database, "_get_status_and_error_list",
                       return_value=status_error_list_results) as patch_staterror_list,\
          patch.object(input_to_database,
@@ -391,7 +393,7 @@ def test_valid_validatefile(genie_config):
             validation_statusdf, error_trackerdf, entities)
         patch_determine_filetype.assert_called_once()
         patch_staterror_list.assert_called_once_with(
-            valid, message, entities)
+            valid_results.is_valid(), message, entities)
         patch_send_email.assert_not_called()
 
 
@@ -410,7 +412,9 @@ def test_invalid_validatefile(genie_config):
     entity.modifiedBy = '333'
     entity.createdBy = '444'
     entities = [entity]
-    valid = False
+    valid_results = example_filetype_format.ValidationResults(
+        errors='Is invalid', warnings=''
+    )
     message = "Is invalid"
     filetype = "clinical"
     status_error_list_results = ([{'entity': entity, 'status': 'INVALID'}],
@@ -444,7 +448,7 @@ def test_invalid_validatefile(genie_config):
                                     'error_list': [],
                                     'to_validate': True}) as patch_check, \
          patch.object(GenieValidationHelper, "validate_single_file",
-                      return_value=(valid, message)) as patch_validate,\
+                      return_value=(valid_results, message)) as patch_validate,\
          patch.object(input_to_database, "_get_status_and_error_list",
                       return_value=status_error_list_results) as patch_staterror_list:
 
@@ -460,7 +464,7 @@ def test_invalid_validatefile(genie_config):
             validation_statusdf, error_trackerdf, entities)
         patch_determine_filetype.assert_called_once()
         patch_staterror_list.assert_called_once_with(
-            valid, message, entities)
+            valid_results.is_valid(), message, entities)
 
 
 def test_already_validated_validatefile():