diff --git a/.github/workflows/pdoc.yml b/.github/workflows/pdoc.yml index f2d8a4e41..818bd3a61 100644 --- a/.github/workflows/pdoc.yml +++ b/.github/workflows/pdoc.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest env: POETRY_VERSION: 1.3.0 - PYTHON_VERSION: 3.10 + PYTHON_VERSION: "3.10" steps: #---------------------------------------------- diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bbf8b9e95..0e5eaec9e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -66,8 +66,8 @@ jobs: curl -sSL https://install.python-poetry.org \ | python3 - --version ${{ env.POETRY_VERSION }}; poetry config virtualenvs.create true; - poetry config virtualenvs.in-project false; - + poetry config virtualenvs.in-project true; + #---------------------------------------------- # install dependencies and root project #---------------------------------------------- diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 5a562e416..e196bbe14 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -11,8 +11,8 @@ import pandas as pd import requests from jsonschema import ValidationError -from synapseclient.core.exceptions import SynapseNoCredentialsError from synapseclient import File +from synapseclient.core.exceptions import SynapseNoCredentialsError from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.store.synapse import SynapseStorage @@ -479,10 +479,17 @@ def generate_filename_error( Errors: list[str] Error details for further storage. warnings: list[str] Warning details for further storage. """ - if error_type == "path does not exist": - error_message = f"The file path '{invalid_entry}' on row {row_num} does not exist in the file view." - elif error_type == "mismatched entityId": - error_message = f"The entityId for file path '{invalid_entry}' on row {row_num} does not match the entityId for the file in the file view" + error_messages = { + "mismatched entityId": f"The entityId for file path '{invalid_entry}' on row {row_num}" + " does not match the entityId for the file in the file view.", + "path does not exist": f"The file path '{invalid_entry}' on row {row_num} does not exist in the file view.", + "entityId does not exist": f"The entityId for file path '{invalid_entry}' on row {row_num}" + " does not exist in the file view.", + "missing entityId": f"The entityId is missing for file path '{invalid_entry}' on row {row_num}.", + } + error_message = error_messages.get(error_type, None) + if not error_message: + raise KeyError(f"Unsupported error type provided: '{error_type}'") error_list, warning_list = GenerateError.raise_and_store_message( dmge=dmge, @@ -2075,12 +2082,11 @@ def filename_validation( fileview = self.synStore.storageFileviewTable.reset_index(drop=True) # filename in dataset? files_in_view = manifest["Filename"].isin(fileview["path"]) + entity_ids_in_view = manifest["entityId"].isin(fileview["id"]) # filenames match with entity IDs in dataset joined_df = manifest.merge( - fileview, how="outer", left_on="Filename", right_on="path" + fileview, how="left", left_on="Filename", right_on="path" ) - # cover case where there are more files in dataset than in manifest - joined_df = joined_df.loc[~joined_df["Component"].isna()].reset_index(drop=True) entity_id_match = joined_df["id"] == joined_df["entityId"] @@ -2089,6 +2095,14 @@ def filename_validation( manifest_with_errors["Error"] = pd.NA manifest_with_errors.loc[~entity_id_match, "Error"] = "mismatched entityId" manifest_with_errors.loc[~files_in_view, "Error"] = "path does not exist" + manifest_with_errors.loc[ + ~entity_ids_in_view, "Error" + ] = "entityId does not exist" + manifest_with_errors.loc[ + (manifest_with_errors["entityId"].isna()) + | (manifest_with_errors["entityId"] == ""), + "Error", + ] = "missing entityId" # Generate errors invalid_entries = manifest_with_errors.loc[ @@ -2098,7 +2112,8 @@ def filename_validation( vr_errors, vr_warnings = GenerateError.generate_filename_error( val_rule=val_rule, attribute_name="Filename", - row_num=str(index), + # +2 to make consistent with other validation functions + row_num=str(index + 2), invalid_entry=data["Filename"], error_type=data["Error"], dmge=self.dmge, diff --git a/tests/data/mock_manifests/InvalidFilenameManifest.csv b/tests/data/mock_manifests/InvalidFilenameManifest.csv index 211f2a12c..bc2619e55 100644 --- a/tests/data/mock_manifests/InvalidFilenameManifest.csv +++ b/tests/data/mock_manifests/InvalidFilenameManifest.csv @@ -1,6 +1,7 @@ Component,Filename,entityId MockFilename,schematic - main/MockFilenameComponent/txt1.txt,syn61682653 MockFilename,schematic - main/MockFilenameComponent/txt2.txt,syn61682660 -MockFilename,schematic - main/MockFilenameComponent/txt3.txt,syn61682662 +MockFilename,schematic - main/MockFilenameComponent/txt3.txt,syn61682653 +MockFilename,schematic - main/MockFilenameComponent/this_file_does_not_exist.txt,syn61682653 MockFilename,schematic - main/MockFilenameComponent/txt4.txt,syn6168265 -MockFilename,schematic - main/MockFilenameComponent/txt5.txt, +MockFilename,schematic - main/MockFilenameComponent/txt6.txt, diff --git a/tests/test_validation.py b/tests/test_validation.py index cf0b70aaa..cdd6766c0 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -693,14 +693,13 @@ def test_filename_manifest(self, helpers, dmge): project_scope=["syn23643250"], dataset_scope="syn61682648", ) - # Check errors assert ( GenerateError.generate_filename_error( - val_rule="filenameExists", + val_rule="filenameExists syn61682648", attribute_name="Filename", - row_num="3", - invalid_entry="schematic - main/MockFilenameComponent/txt4.txt", + row_num="4", + invalid_entry="schematic - main/MockFilenameComponent/txt3.txt", error_type="mismatched entityId", dmge=dmge, )[0] @@ -709,17 +708,41 @@ def test_filename_manifest(self, helpers, dmge): assert ( GenerateError.generate_filename_error( - val_rule="filenameExists", + val_rule="filenameExists syn61682648", attribute_name="Filename", - row_num="4", - invalid_entry="schematic - main/MockFilenameComponent/txt5.txt", + row_num="5", + invalid_entry="schematic - main/MockFilenameComponent/this_file_does_not_exist.txt", error_type="path does not exist", dmge=dmge, )[0] in errors ) - assert len(errors) == 2 + assert ( + GenerateError.generate_filename_error( + val_rule="filenameExists syn61682648", + attribute_name="Filename", + row_num="6", + invalid_entry="schematic - main/MockFilenameComponent/txt4.txt", + error_type="entityId does not exist", + dmge=dmge, + )[0] + in errors + ) + + assert ( + GenerateError.generate_filename_error( + val_rule="filenameExists syn61682648", + attribute_name="Filename", + row_num="7", + invalid_entry="schematic - main/MockFilenameComponent/txt6.txt", + error_type="missing entityId", + dmge=dmge, + )[0] + in errors + ) + + assert len(errors) == 4 assert len(warnings) == 0 def test_filename_manifest_exception(self, helpers, dmge): diff --git a/tests/unit/test_validate_attribute.py b/tests/unit/test_validate_attribute.py index 26ab9f167..d782fab12 100644 --- a/tests/unit/test_validate_attribute.py +++ b/tests/unit/test_validate_attribute.py @@ -1,15 +1,15 @@ """Unit testing for the ValidateAttribute class""" from typing import Generator -from unittest.mock import patch -import pytest +from unittest.mock import Mock, patch -from pandas import Series, DataFrame, concat import numpy as np +import pytest +from pandas import DataFrame, Series, concat -from schematic.models.validate_attribute import ValidateAttribute -from schematic.schemas.data_model_graph import DataModelGraphExplorer import schematic.models.validate_attribute +from schematic.models.validate_attribute import GenerateError, ValidateAttribute +from schematic.schemas.data_model_graph import DataModelGraphExplorer # pylint: disable=protected-access # pylint: disable=too-many-public-methods @@ -99,6 +99,53 @@ } ) +TEST_DF_FILEVIEW = DataFrame( + { + "id": ["syn1", "syn2", "syn3"], + "path": ["test1.txt", "test2.txt", "test3.txt"], + } +) + +TEST_MANIFEST_GOOD = DataFrame( + { + "Component": ["Mockfilename", "Mockfilename", "Mockfilename"], + "Filename": ["test1.txt", "test2.txt", "test3.txt"], + "entityId": ["syn1", "syn2", "syn3"], + } +) + +TEST_MANIFEST_MISSING_ENTITY_ID = DataFrame( + { + "Component": ["Mockfilename", "Mockfilename", "Mockfilename"], + "Filename": ["test1.txt", "test2.txt", "test3.txt"], + "entityId": ["syn1", "syn2", ""], + } +) + +TEST_MANIFEST_FILENAME_NOT_IN_VIEW = DataFrame( + { + "Component": ["Mockfilename", "Mockfilename", "Mockfilename"], + "Filename": ["test1.txt", "test2.txt", "test_bad.txt"], + "entityId": ["syn1", "syn2", "syn3"], + } +) + +TEST_MANIFEST_ENTITY_ID_NOT_IN_VIEW = DataFrame( + { + "Component": ["Mockfilename", "Mockfilename", "Mockfilename"], + "Filename": ["test1.txt", "test2.txt", "test3.txt"], + "entityId": ["syn1", "syn2", "syn_bad"], + } +) + +TEST_MANIFEST_ENTITY_ID_MISMATCH = DataFrame( + { + "Component": ["Mockfilename", "Mockfilename", "Mockfilename"], + "Filename": ["test1.txt", "test2.txt", "test3.txt"], + "entityId": ["syn1", "syn2", "syn2"], + } +) + @pytest.fixture(name="va_obj") def fixture_va_obj( @@ -171,6 +218,93 @@ def fixture_cross_val_col_names() -> Generator[dict[str, str], None, None]: yield column_names +class TestGenerateError: + """Unit tests for the GenerateError class""" + + val_rule = "filenameExists syn123456" + attribute_name = "Filename" + row_num = "2" + invalid_entry = "test_file.txt" + + @pytest.mark.parametrize( + "error_type, expected_message", + [ + ( + "mismatched entityId", + "The entityId for file path 'test_file.txt' on row 2 does not match the entityId for the file in the file view.", + ), + ( + "path does not exist", + "The file path 'test_file.txt' on row 2 does not exist in the file view.", + ), + ( + "entityId does not exist", + "The entityId for file path 'test_file.txt' on row 2 does not exist in the file view.", + ), + ( + "missing entityId", + "The entityId is missing for file path 'test_file.txt' on row 2.", + ), + ], + ids=[ + "mismatched entityId", + "path does not exist", + "entityId does not exist", + "missing entityId", + ], + ) + def test_generate_filename_error( + self, dmge: DataModelGraphExplorer, error_type: str, expected_message: str + ): + with patch.object( + GenerateError, + "raise_and_store_message", + return_value=( + [ + self.row_num, + self.attribute_name, + expected_message, + self.invalid_entry, + ], + [], + ), + ) as mock_raise_and_store: + error_list, _ = GenerateError.generate_filename_error( + val_rule=self.val_rule, + attribute_name=self.attribute_name, + row_num=self.row_num, + invalid_entry=self.invalid_entry, + error_type=error_type, + dmge=dmge, + ) + mock_raise_and_store.assert_called_once_with( + dmge=dmge, + val_rule=self.val_rule, + error_row=self.row_num, + error_col=self.attribute_name, + error_message=expected_message, + error_val=self.invalid_entry, + ) + + assert len(error_list) == 4 + assert error_list[2] == expected_message + + def test_generate_filename_error_unsupported_error_type( + self, dmge: DataModelGraphExplorer + ): + with pytest.raises( + KeyError, match="Unsupported error type provided: 'unsupported error type'" + ) as exc_info: + GenerateError.generate_filename_error( + dmge=dmge, + val_rule=self.val_rule, + attribute_name=self.attribute_name, + row_num=self.row_num, + invalid_entry=self.invalid_entry, + error_type="unsupported error type", + ) + + class TestValidateAttributeObject: """Testing for ValidateAttribute class with all Synapse calls mocked""" @@ -532,6 +666,120 @@ def test_cross_validation_value_match_none_rules_errors( assert errors == [] assert len(warnings) == 1 + @pytest.mark.parametrize( + "manifest, expected_errors, expected_warnings, generates_error", + [ + (TEST_MANIFEST_GOOD, [], [], False), + ( + TEST_MANIFEST_MISSING_ENTITY_ID, + [ + [ + "4", + "Filename", + "The entityId for file path 'test3.txt' on row 4 does not exist in the file view.", + "test3.txt", + ] + ], + [], + True, + ), + ( + TEST_MANIFEST_FILENAME_NOT_IN_VIEW, + [ + [ + "4", + "Filename", + "The file path 'test_bad.txt' on row 4 does not exist in the file view.", + "test_bad.txt", + ] + ], + [], + True, + ), + ( + TEST_MANIFEST_ENTITY_ID_NOT_IN_VIEW, + [ + [ + "4", + "Filename", + "The entityId for file path 'test3.txt' on row 4 does not exist in the file view.", + "test3.txt", + ] + ], + [], + True, + ), + ( + TEST_MANIFEST_ENTITY_ID_MISMATCH, + [ + [ + "4", + "Filename", + "The entityId for file path 'test3.txt' on row 4 does not match " + "the entityId for the file in the file view.", + "test3.txt", + ] + ], + [], + True, + ), + ], + ids=[ + "valid_manifest", + "missing_entity_id", + "bad_filename", + "bad_entity_id", + "entity_id_mismatch", + ], + ) + def test_filename_validation( + self, + va_obj: ValidateAttribute, + manifest: DataFrame, + expected_errors: list, + expected_warnings: list, + generates_error: bool, + ): + mock_synapse_storage = Mock() + mock_synapse_storage.storageFileviewTable = TEST_DF_FILEVIEW + va_obj.synStore = mock_synapse_storage + with patch.object( + schematic.models.validate_attribute.ValidateAttribute, + "_login", + ), patch.object( + mock_synapse_storage, "reset_index", return_value=TEST_DF_FILEVIEW + ), patch.object( + schematic.models.validate_attribute.GenerateError, + "generate_filename_error", + return_value=( + expected_errors if len(expected_errors) < 1 else expected_errors[0], + expected_warnings, + ), + ) as mock_generate_filename_error: + actual_errors, actual_warnings = va_obj.filename_validation( + val_rule="filenameExists syn61682648", + manifest=manifest, + access_token="test_access_token", + dataset_scope="syn1", + ) + mock_generate_filename_error.assert_called_once() if generates_error else mock_generate_filename_error.assert_not_called() + assert (actual_errors, actual_warnings) == ( + expected_errors, + expected_warnings, + ) + + def test_filename_validation_null_dataset_scope(self, va_obj: ValidateAttribute): + with pytest.raises( + ValueError, + match="A dataset is required to be specified for filename validation", + ): + va_obj.filename_validation( + val_rule="filenameExists syn61682648", + manifest=TEST_MANIFEST_GOOD, + access_token="test_access_token", + dataset_scope=None, + ) + ######################################### # _run_validation_across_target_manifests #########################################