diff --git a/.github/workflows/api_test.yml b/.github/workflows/api_test.yml index 0dea21710..3e48bd09a 100644 --- a/.github/workflows/api_test.yml +++ b/.github/workflows/api_test.yml @@ -13,7 +13,7 @@ on: jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: POETRY_VERSION: 1.3.0 strategy: diff --git a/.github/workflows/pdoc.yml b/.github/workflows/pdoc.yml index 818bd3a61..d24ef8652 100644 --- a/.github/workflows/pdoc.yml +++ b/.github/workflows/pdoc.yml @@ -25,7 +25,7 @@ concurrency: cancel-in-progress: true jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: POETRY_VERSION: 1.3.0 PYTHON_VERSION: "3.10" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 5b85c995f..8f2e686ea 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -7,9 +7,10 @@ on: jobs: pypi_release: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: POETRY_VERSION: 1.3.0 + PYTHON_VERSION: "3.10" if: github.event_name == 'push' && contains(github.ref, 'refs/tags') steps: #---------------------------------------------- @@ -18,10 +19,10 @@ jobs: - name: Check out repository uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ env.PYTHON_VERSION }} #---------------------------------------------- # install & configure poetry @@ -48,7 +49,7 @@ jobs: - name: Get current pushed tag run: | echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - echo ${{ env.RELEASE_VERSION }} + echo "$RELEASE_VERSION" #---------------------------------------------- # override version tag diff --git a/.github/workflows/scan_repo.yml b/.github/workflows/scan_repo.yml index 582c68f45..434b4b522 100644 --- a/.github/workflows/scan_repo.yml +++ b/.github/workflows/scan_repo.yml @@ -12,6 +12,9 @@ jobs: trivy: name: Trivy runs-on: ubuntu-latest + env: + TRIVY_DB_REPOSITORY: public.ecr.aws/aquasecurity/trivy-db:2 + TRIVY_JAVA_DB_REPOSITORY: public.ecr.aws/aquasecurity/trivy-java-db:1 steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 535d7804f..0090d1558 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ concurrency: cancel-in-progress: true jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: POETRY_VERSION: 1.3.0 strategy: @@ -127,11 +127,31 @@ jobs: #---------------------------------------------- # run integration test suite #---------------------------------------------- + + - name: Retrieve telemetry access token from IDP + if: ${{ contains(fromJSON('["3.10"]'), matrix.python-version) }} + id: retrieve-telemetry-access-token + run: | + response=$(curl --request POST \ + --url ${{ vars.TELEMETRY_AUTH_CLIENT_URL }} \ + --header 'content-type: application/json' \ + --data '{"client_id":"${{ vars.TELEMETRY_AUTH_CLIENT_ID }}","client_secret":"${{ secrets.TELEMETRY_AUTH_CLIENT_SECRET }}","audience":"${{ vars.TELEMETRY_AUTH_AUDIENCE }}","grant_type":"client_credentials"}') + access_token=$(echo $response | jq -r .access_token) + echo "::add-mask::$access_token" + echo "TELEMETRY_ACCESS_TOKEN=$access_token" >> "$GITHUB_OUTPUT" - name: Run integration tests if: ${{ contains(fromJSON('["3.10"]'), matrix.python-version) }} env: SYNAPSE_ACCESS_TOKEN: ${{ secrets.SYNAPSE_ACCESS_TOKEN }} SERVICE_ACCOUNT_CREDS: ${{ secrets.SERVICE_ACCOUNT_CREDS }} + 
OTEL_EXPORTER_OTLP_HEADERS: "Authorization=Bearer ${{ steps.retrieve-telemetry-access-token.outputs.TELEMETRY_ACCESS_TOKEN }}" + DEPLOYMENT_ENVIRONMENT: ${{ vars.DEPLOYMENT_ENVIRONMENT }} + OTEL_EXPORTER_OTLP_ENDPOINT: ${{ vars.OTEL_EXPORTER_OTLP_ENDPOINT }} + TRACING_EXPORT_FORMAT: ${{ vars.TRACING_EXPORT_FORMAT }} + LOGGING_EXPORT_FORMAT: ${{ vars.LOGGING_EXPORT_FORMAT }} + TRACING_SERVICE_NAME: ${{ vars.TRACING_SERVICE_NAME }} + LOGGING_SERVICE_NAME: ${{ vars.LOGGING_SERVICE_NAME }} + SERVICE_INSTANCE_ID: ${{ github.head_ref || github.ref_name }} run: > poetry run pytest --durations=0 --cov-append --cov-report=term --cov-report=html:htmlcov --cov-report=xml:coverage.xml --cov=schematic/ -m "not (rule_benchmark or single_process_execution)" --reruns 4 -n 8 --ignore=tests/unit @@ -141,6 +161,13 @@ jobs: env: SYNAPSE_ACCESS_TOKEN: ${{ secrets.SYNAPSE_ACCESS_TOKEN }} SERVICE_ACCOUNT_CREDS: ${{ secrets.SERVICE_ACCOUNT_CREDS }} + OTEL_EXPORTER_OTLP_HEADERS: "Authorization=Bearer ${{ steps.retrieve-telemetry-access-token.outputs.TELEMETRY_ACCESS_TOKEN }}" + DEPLOYMENT_ENVIRONMENT: ${{ vars.DEPLOYMENT_ENVIRONMENT }} + OTEL_EXPORTER_OTLP_ENDPOINT: ${{ vars.OTEL_EXPORTER_OTLP_ENDPOINT }} + TRACING_EXPORT_FORMAT: ${{ vars.TRACING_EXPORT_FORMAT }} + LOGGING_EXPORT_FORMAT: ${{ vars.LOGGING_EXPORT_FORMAT }} + TRACING_SERVICE_NAME: ${{ vars.TRACING_SERVICE_NAME }} + LOGGING_SERVICE_NAME: ${{ vars.LOGGING_SERVICE_NAME }} run: > poetry run pytest --durations=0 --cov-append --cov-report=term --cov-report=html:htmlcov --cov-report=xml:coverage.xml --cov=schematic/ -m "single_process_execution" --reruns 4 --ignore=tests/unit diff --git a/CONTRIBUTION.md b/CONTRIBUTION.md index 4d8646e6c..974898f03 100644 --- a/CONTRIBUTION.md +++ b/CONTRIBUTION.md @@ -6,7 +6,7 @@ Please note we have a [code of conduct](CODE_OF_CONDUCT.md), please follow it in ## How to report bugs or feature requests -You can **create bug and feature requests** through [Sage Bionetwork's FAIR Data service desk](https://sagebionetworks.jira.com/servicedesk/customer/portal/5/group/8). Providing enough details to the developers to verify and troubleshoot your issue is paramount: +You can **create bug and feature requests** through [Sage Bionetwork's DPE schematic support](https://sagebionetworks.jira.com/servicedesk/customer/portal/5/group/7/create/225). Providing enough details to the developers to verify and troubleshoot your issue is paramount: - **Provide a clear and descriptive title as well as a concise summary** of the issue to identify the problem. - **Describe the exact steps which reproduce the problem** in as many details as possible. - **Describe the behavior you observed after following the steps** and point out what exactly is the problem with that behavior. @@ -25,7 +25,7 @@ For new features, bugs, enhancements: #### 1. Branch Setup * Pull the latest code from the develop branch in the upstream repository. -* Checkout a new branch formatted like so: `develop-` from the develop branch +* Checkout a new branch formatted like so: `-` from the develop branch #### 2. Development Workflow * Develop on your new branch. @@ -35,11 +35,11 @@ For new features, bugs, enhancements: * You can choose to create a draft PR if you prefer to develop this way #### 3. 
Branch Management -* Push code to `develop-` in upstream repo: +* Push code to `-` in upstream repo: ``` - git push develop- + git push - ``` -* Branch off `develop-` if you need to work on multiple features associated with the same code base +* Branch off `-` if you need to work on multiple features associated with the same code base * After feature work is complete and before creating a PR to the develop branch in upstream a. ensure that code runs locally b. test for logical correctness locally @@ -47,10 +47,10 @@ For new features, bugs, enhancements: c. wait for git workflow to complete (e.g. tests are run) on github #### 4. Pull Request and Review -* Create a PR from `develop-` into the develop branch of the upstream repo +* Create a PR from `-` into the develop branch of the upstream repo * Request a code review on the PR * Once code is approved merge in the develop branch. The **"Squash and merge"** strategy should be used for a cleaner commit history on the `develop` branch. The description of the squash commit should include enough information to understand the context of the changes that were made. -* Once the actions pass on the main branch, delete the `develop-` branch +* Once the actions pass on the main branch, delete the `-` branch ### Updating readthedocs documentation 1. Navigate to the docs directory. diff --git a/Dockerfile b/Dockerfile index 4a5ea43f9..7653ee335 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,4 +29,4 @@ RUN poetry install --no-interaction --no-ansi --no-root COPY . ./ -RUN poetry install --only-root \ No newline at end of file +RUN poetry install --only-root diff --git a/env.example b/env.example index f1f56a8a4..bf999908b 100644 --- a/env.example +++ b/env.example @@ -11,6 +11,8 @@ SERVICE_ACCOUNT_CREDS='Provide service account creds' # LOGGING_EXPORT_FORMAT=otlp # TRACING_SERVICE_NAME=schematic-api # LOGGING_SERVICE_NAME=schematic-api +## Instance ID is used during integration tests export to identify the git branch +# SERVICE_INSTANCE_ID=schematic-1234 ## Other examples: dev, staging, prod # DEPLOYMENT_ENVIRONMENT=local # OTEL_EXPORTER_OTLP_ENDPOINT=https://..../telemetry diff --git a/pyproject.toml b/pyproject.toml index 39315d360..4a75a7ad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "schematicpy" -version = "24.11.2" +version = "24.12.1" description = "Package for biomedical data model and metadata ingress management" authors = [ "Milen Nikolov ", diff --git a/schematic/__init__.py b/schematic/__init__.py index 718feb6a3..a43f53553 100644 --- a/schematic/__init__.py +++ b/schematic/__init__.py @@ -10,7 +10,13 @@ from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor -from opentelemetry.sdk.resources import DEPLOYMENT_ENVIRONMENT, SERVICE_NAME, Resource +from opentelemetry.sdk.resources import ( + DEPLOYMENT_ENVIRONMENT, + SERVICE_INSTANCE_ID, + SERVICE_NAME, + SERVICE_VERSION, + Resource, +) from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor, Span from opentelemetry.sdk.trace.sampling import ALWAYS_OFF @@ -20,6 +26,7 @@ from schematic.configuration.configuration import CONFIG from schematic.loader import LOADER +from schematic.version import __version__ from schematic_api.api.security_controller import info_from_bearer_auth Synapse.allow_client_caching(False) @@ -91,16 +98,14 @@ def 
set_up_tracing(session: requests.Session) -> None: Synapse.enable_open_telemetry(True) tracing_service_name = os.environ.get("TRACING_SERVICE_NAME", "schematic-api") deployment_environment = os.environ.get("DEPLOYMENT_ENVIRONMENT", "") + service_instance_id = os.environ.get("SERVICE_INSTANCE_ID", "") trace.set_tracer_provider( TracerProvider( resource=Resource( attributes={ + SERVICE_INSTANCE_ID: service_instance_id, SERVICE_NAME: tracing_service_name, - # TODO: Revisit this portion later on. As of 11/12/2024 when - # deploying this to ECS or running within a docker container, - # the package version errors out with the following error: - # importlib.metadata.PackageNotFoundError: No package metadata was found for schematicpy - # SERVICE_VERSION: package_version, + SERVICE_VERSION: __version__, DEPLOYMENT_ENVIRONMENT: deployment_environment, } ) @@ -122,11 +127,14 @@ def set_up_logging(session: requests.Session) -> None: logging_export = os.environ.get("LOGGING_EXPORT_FORMAT", None) logging_service_name = os.environ.get("LOGGING_SERVICE_NAME", "schematic-api") deployment_environment = os.environ.get("DEPLOYMENT_ENVIRONMENT", "") + service_instance_id = os.environ.get("SERVICE_INSTANCE_ID", "") if logging_export == "otlp": resource = Resource.create( { + SERVICE_INSTANCE_ID: service_instance_id, SERVICE_NAME: logging_service_name, DEPLOYMENT_ENVIRONMENT: deployment_environment, + SERVICE_VERSION: __version__, } ) diff --git a/schematic/__main__.py b/schematic/__main__.py index 22216f273..094624d7e 100644 --- a/schematic/__main__.py +++ b/schematic/__main__.py @@ -13,6 +13,7 @@ from schematic.visualization.commands import ( viz as viz_cli, ) # viz generation commands +from schematic import __version__ logger = logging.getLogger() click_log.basic_config(logger) @@ -24,6 +25,7 @@ # invoke_without_command=True -> forces the application not to show aids before losing them with a --h @click.group(context_settings=CONTEXT_SETTINGS, invoke_without_command=True) @click_log.simple_verbosity_option(logger) +@click.version_option(version=__version__, prog_name="schematic") def main(): """ Command line interface to the `schematic` backend services. 
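
For reference, a minimal sketch (not part of the patch) of how the `click.version_option` decorator added to `schematic/__main__.py` is expected to surface the now-hardcoded package version on the CLI; the standalone group and the `CliRunner` invocation below are illustrative only:

```python
# Illustrative sketch only: mirrors the decorator added to schematic/__main__.py above.
import click
from click.testing import CliRunner

__version__ = "24.12.1"  # hardcoded in schematic/version.py by this changeset


@click.group(invoke_without_command=True)
@click.version_option(version=__version__, prog_name="schematic")
def main():
    """Command line interface to the `schematic` backend services."""


if __name__ == "__main__":
    # Expected output: "schematic, version 24.12.1"
    result = CliRunner().invoke(main, ["--version"])
    print(result.output)
```
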
diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py index 69b86e136..981e6d8e3 100644 --- a/schematic/manifest/generator.py +++ b/schematic/manifest/generator.py @@ -1904,6 +1904,8 @@ def get_manifest( # TODO: avoid explicitly exposing Synapse store functionality # just instantiate a Store class and let it decide at runtime/config # the store type + # TODO: determine which parts of fileview are necessary for `get` operations + # and pass query parameters at object instantiation to avoid having to re-query if access_token: # for getting an existing manifest on AWS store = SynapseStorage(access_token=access_token) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 9b13bebaf..69eea3fab 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -17,6 +17,7 @@ from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import read_csv from schematic.utils.validate_rules_utils import validation_rule_info from schematic.utils.validate_utils import ( comma_separated_list_regex, @@ -868,7 +869,7 @@ def _get_target_manifest_dataframes( entity: File = self.synStore.getDatasetManifest( datasetId=dataset_id, downloadFile=True ) - manifests.append(pd.read_csv(entity.path)) + manifests.append(read_csv(entity.path)) return dict(zip(manifest_ids, manifests)) def get_target_manifests( @@ -2119,7 +2120,9 @@ def filename_validation( where_clauses = [] - dataset_clause = f"parentId='{dataset_scope}'" + dataset_clause = SynapseStorage.build_clause_from_dataset_id( + dataset_id=dataset_scope + ) where_clauses.append(dataset_clause) self._login( diff --git a/schematic/store/database/synapse_database_wrapper.py b/schematic/store/database/synapse_database_wrapper.py index b827b140f..ba0ed2dc9 100644 --- a/schematic/store/database/synapse_database_wrapper.py +++ b/schematic/store/database/synapse_database_wrapper.py @@ -8,6 +8,7 @@ from opentelemetry import trace from schematic.store.synapse_tracker import SynapseEntityTracker +from schematic.utils.df_utils import read_csv class SynapseTableNameError(Exception): @@ -108,7 +109,7 @@ def execute_sql_query( pandas.DataFrame: The queried table """ result = self.execute_sql_statement(query, include_row_data) - table = pandas.read_csv(result.filepath) + table = read_csv(result.filepath) return table def execute_sql_statement( diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 7ccd98362..93a7140f2 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -19,12 +19,10 @@ import numpy as np import pandas as pd import synapseclient -import synapseutils from opentelemetry import trace from synapseclient import Annotations as OldAnnotations from synapseclient import ( Column, - Entity, EntityViewSchema, EntityViewType, File, @@ -58,7 +56,12 @@ from schematic.store.base import BaseStorage from schematic.store.database.synapse_database import SynapseDatabase from schematic.store.synapse_tracker import SynapseEntityTracker -from schematic.utils.df_utils import col_in_dataframe, load_df, update_df +from schematic.utils.df_utils import ( + STR_NA_VALUES_FILTERED, + col_in_dataframe, + load_df, + update_df, +) # entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment # Please do not remove these import statements @@ -401,7 +404,7 @@ def query_fileview( 
try: self.storageFileviewTable = self.syn.tableQuery( query=self.fileview_query, - ).asDataFrame() + ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) except SynapseHTTPError as exc: exception_text = str(exc) if "Unknown column path" in exception_text: @@ -416,6 +419,30 @@ def query_fileview( else: raise AccessCredentialsError(self.storageFileview) + @staticmethod + def build_clause_from_dataset_id( + dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None + ) -> str: + """ + Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized. + Args: + dataset_id: Synapse ID of a dataset that should be used to limit the query + dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query + Returns: + clause for the query or an empty string if no dataset ID is provided + """ + # Calling this method without specifying synIDs will complete but will not scope the view + if (not dataset_id) and (not dataset_folder_list): + return "" + + # This will be used to gather files under a dataset recursively with a fileview query instead of walking + if dataset_folder_list: + search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list) + return f"parentId IN ({search_folders})" + + # `dataset_id` should be provided when all files are stored directly under the dataset folder + return f"parentId='{dataset_id}'" + def _build_query( self, columns: Optional[list] = None, where_clauses: Optional[list] = None ): @@ -666,7 +693,7 @@ def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: def getFilesInStorageDataset( self, datasetId: str, fileNames: List = None, fullpath: bool = True ) -> List[Tuple[str, str]]: - """Gets all files in a given dataset folder. + """Gets all files (excluding manifest files) in a given dataset folder. Args: datasetId: synapse ID of a storage dataset. @@ -680,105 +707,58 @@ def getFilesInStorageDataset( Raises: ValueError: Dataset ID not found. """ - # select all files within a given storage dataset folder (top level folder in - # a Synapse storage project or folder marked with contentType = 'dataset') - walked_path = synapseutils.walk( - self.syn, datasetId, includeTypes=["folder", "file"] - ) - - current_entity_location = self.synapse_entity_tracker.get( - synapse_id=datasetId, syn=self.syn, download_file=False - ) - - def walk_back_to_project( - current_location: Entity, location_prefix: str, skip_entry: bool - ) -> str: - """ - Recursively walk back up the project structure to get the paths of the - names of each of the directories where we started the walk function. - - Args: - current_location (Entity): The current entity location in the project structure. - location_prefix (str): The prefix to prepend to the path. - skip_entry (bool): Whether to skip the current entry in the path. When - this is True it means we are looking at our starting point. If our - starting point is the project itself we can go ahead and return - back the project as the prefix. - - Returns: - str: The path of the names of each of the directories up to the project root. 
- """ - if ( - skip_entry - and "concreteType" in current_location - and current_location["concreteType"] == PROJECT_ENTITY - ): - return f"{current_location.name}/{location_prefix}" + file_list = [] - updated_prefix = ( - location_prefix - if skip_entry - else f"{current_location.name}/{location_prefix}" - ) - if ( - "concreteType" in current_location - and current_location["concreteType"] == PROJECT_ENTITY - ): - return updated_prefix - current_location = self.synapse_entity_tracker.get( - synapse_id=current_location["parentId"], - syn=self.syn, - download_file=False, + # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view + if self.storageFileviewTable.empty: + raise ValueError( + f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." ) - return walk_back_to_project( - current_location=current_location, - location_prefix=updated_prefix, - skip_entry=False, + + child_path = self.storageFileviewTable.loc[ + self.storageFileviewTable["parentId"] == datasetId, "path" + ] + if child_path.empty: + raise LookupError( + f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." ) + child_path = child_path.iloc[0] - prefix = walk_back_to_project( - current_location=current_entity_location, - location_prefix="", - skip_entry=True, - ) + # Get the dataset path by eliminating the child's portion of the path to account for nested datasets + parent = child_path.split("/")[:-1] + parent = "/".join(parent) - project_id = self.getDatasetProject(datasetId) - project = self.synapse_entity_tracker.get( - synapse_id=project_id, syn=self.syn, download_file=False - ) - project_name = project.name - file_list = [] + # Format dataset path to be used in table query + dataset_path = f"'{parent}/%'" - # iterate over all results - for dirpath, _, path_filenames in walked_path: - # iterate over all files in a folder - for path_filename in path_filenames: - if ("manifest" not in path_filename[0] and not fileNames) or ( - fileNames and path_filename[0] in fileNames - ): - # don't add manifest to list of files unless it is specified in the - # list of specified fileNames; return all found files - # except the manifest if no fileNames have been specified - # TODO: refactor for clarity/maintainability - - if fullpath: - # append directory path to filename - if dirpath[0].startswith(f"{project_name}/"): - path_without_project_prefix = ( - dirpath[0] + "/" - ).removeprefix(f"{project_name}/") - path_filename = ( - prefix + path_without_project_prefix + path_filename[0], - path_filename[1], - ) - else: - path_filename = ( - prefix + dirpath[0] + "/" + path_filename[0], - path_filename[1], - ) + # When querying, only include files to exclude entity files and subdirectories + where_clauses = [f"path like {dataset_path}", "type='file'"] + + # Requery the fileview to specifically get the files in the given dataset + self.query_fileview(columns=["id", "path"], where_clauses=where_clauses) + + # Exclude manifest files + non_manifest_files = self.storageFileviewTable.loc[ + ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"), + :, + ] + + # Remove all files that are not in the list of fileNames + if fileNames: + filename_regex = "|".join(fileNames) + + matching_files = non_manifest_files["path"].str.contains( + filename_regex, case=False, regex=True + ) + + non_manifest_files = non_manifest_files.loc[matching_files, :] - # add file name file id tuple, rearranged so that id is first and 
name follows - file_list.append(path_filename[::-1]) + # Truncate path if necessary + if not fullpath: + non_manifest_files.path = non_manifest_files.path.apply(os.path.basename) + + # Return list of files as expected by other methods + file_list = list(non_manifest_files.itertuples(index=False, name=None)) return file_list @@ -1056,7 +1036,7 @@ def updateDatasetManifestFiles( Returns: Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. - If there is no existing manifest return None + If there is no existing manifest or if the manifest does not have an entityId column, return None """ # get existing manifest Synapse ID @@ -1070,8 +1050,12 @@ def updateDatasetManifestFiles( synapse_id=manifest_id, syn=self.syn, download_file=True ) manifest_filepath = manifest_entity.path - manifest = load_df(manifest_filepath) + + # If the manifest does not have an entityId column, trigger a new manifest to be generated + if "entityId" not in manifest.columns: + return None + manifest_is_file_based = "Filename" in manifest.columns if manifest_is_file_based: @@ -1079,7 +1063,6 @@ def updateDatasetManifestFiles( # note that if there is an existing manifest and there are files in the dataset # the columns Filename and entityId are assumed to be present in manifest schema # TODO: use idiomatic panda syntax - dataset_files, manifest = self.fill_in_entity_id_filename( datasetId, manifest ) @@ -1119,6 +1102,13 @@ def _get_file_entityIds( "No manifest was passed in, a manifest is required when `only_new_files` is True." ) + if "entityId" not in manifest.columns: + raise ValueError( + "The manifest in your dataset and/or top level folder must contain the 'entityId' column. " + "Please generate an empty manifest without annotations, manually add annotations to the " + "appropriate files in the manifest, and then try again." 
+ ) + # find new files (that are not in the current manifest) if any for file_id, file_name in dataset_files: if not file_id in manifest["entityId"].values: @@ -1433,7 +1423,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable """ results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) - df = results.asDataFrame(rowIdAndVersionInIndex=False) + df = results.asDataFrame( + rowIdAndVersionInIndex=False, + na_values=STR_NA_VALUES_FILTERED, + keep_default_na=False, + ) return df, results @@ -3485,7 +3479,11 @@ def query(self, tidy=True, force=False): if self.table is None or force: fileview_id = self.view_schema["id"] self.results = self.synapse.tableQuery(f"select * from {fileview_id}") - self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False) + self.table = self.results.asDataFrame( + rowIdAndVersionInIndex=False, + na_values=STR_NA_VALUES_FILTERED, + keep_default_na=False, + ) if tidy: self.tidy_table() return self.table diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py index 07c97af90..c6412d349 100644 --- a/schematic/utils/cli_utils.py +++ b/schematic/utils/cli_utils.py @@ -4,10 +4,11 @@ # pylint: disable=anomalous-backslash-in-string import logging - -from typing import Any, Mapping, Sequence, Union, Optional -from functools import reduce import re +from functools import reduce +from typing import Any, Mapping, Optional, Sequence, Union + +from schematic.utils.general import SYN_ID_REGEX logger = logging.getLogger(__name__) @@ -69,7 +70,7 @@ def parse_syn_ids( if not syn_ids: return None - project_regex = re.compile("(syn\d+\,?)+") + project_regex = re.compile(SYN_ID_REGEX) valid = project_regex.fullmatch(syn_ids) if not valid: diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index b25e7db82..ee458ba17 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -4,17 +4,58 @@ import logging from copy import deepcopy -from time import perf_counter -from typing import Union, Any, Optional from datetime import datetime +from time import perf_counter +from typing import Any, Optional, Union + import dateparser as dp -import pandas as pd import numpy as np +import pandas as pd from pandarallel import pandarallel # type: ignore +# pylint:disable=no-name-in-module +from pandas._libs.parsers import STR_NA_VALUES # type: ignore + +STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES) + +try: + STR_NA_VALUES_FILTERED.remove("None") +except KeyError: + pass + logger = logging.getLogger(__name__) +def read_csv( + path_or_buffer: str, + keep_default_na: bool = False, + encoding: str = "utf8", + **load_args: Any, +) -> pd.DataFrame: + """ + A wrapper around pd.read_csv that filters out "None" from the na_values list. + + Args: + path_or_buffer: The path to the file or a buffer containing the file. + keep_default_na: Whether to keep the default na_values list. + encoding: The encoding of the file. + **load_args: Additional arguments to pass to pd.read_csv. + + Returns: + pd.DataFrame: The dataframe created from the CSV file or buffer. 
+ """ + na_values = load_args.pop( + "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None + ) + return pd.read_csv( # type: ignore + path_or_buffer, + na_values=na_values, + keep_default_na=keep_default_na, + encoding=encoding, + **load_args, + ) + + def load_df( file_path: str, preserve_raw_input: bool = True, @@ -45,9 +86,7 @@ def load_df( t_load_df = perf_counter() # Read CSV to df as type specified in kwargs - org_df = pd.read_csv( # type: ignore - file_path, keep_default_na=True, encoding="utf8", **load_args - ) + org_df = read_csv(file_path, encoding="utf8", **load_args) # type: ignore if not isinstance(org_df, pd.DataFrame): raise ValueError( ( diff --git a/schematic/utils/general.py b/schematic/utils/general.py index 0bb932aa3..cdc1be2f2 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -5,7 +5,7 @@ import logging import os import pstats -import subprocess +from pathlib import Path import tempfile from cProfile import Profile from datetime import datetime, timedelta @@ -24,6 +24,8 @@ T = TypeVar("T") +SYN_ID_REGEX = r"(syn\d+\,?)+" + def find_duplicates(_list: list[T]) -> set[T]: """Find duplicate items in a list""" @@ -129,40 +131,19 @@ def calculate_datetime( return date_time_result -def check_synapse_cache_size( - directory: str = "/root/.synapseCache", -) -> float: - """use du --sh command to calculate size of .synapseCache. +def check_synapse_cache_size(directory: str = "/root/.synapseCache") -> float: + """Calculate size of .synapseCache directory in bytes using pathlib. Args: directory (str, optional): .synapseCache directory. Defaults to '/root/.synapseCache' Returns: - float: returns size of .synapsecache directory in bytes + float: size of .synapsecache directory in bytes """ - # Note: this command might fail on windows user. - # But since this command is primarily for running on AWS, it is fine. 
- command = ["du", "-sh", directory] - output = subprocess.run(command, capture_output=True, check=False).stdout.decode( - "utf-8" + total_size = sum( + f.stat().st_size for f in Path(directory).rglob("*") if f.is_file() ) - - # Parsing the output to extract the directory size - size = output.split("\t")[0] - if "K" in size: - size_in_kb = float(size.rstrip("K")) - byte_size = size_in_kb * 1000 - elif "M" in size: - size_in_mb = float(size.rstrip("M")) - byte_size = size_in_mb * 1000000 - elif "G" in size: - size_in_gb = float(size.rstrip("G")) - byte_size = size_in_gb * (1024**3) - elif "B" in size: - byte_size = float(size.rstrip("B")) - else: - logger.error("Cannot recognize the file size unit") - return byte_size + return total_size def clear_synapse_cache(synapse_cache: cache.Cache, minutes: int) -> int: diff --git a/schematic/version.py b/schematic/version.py index a9bdb1578..cec250131 100644 --- a/schematic/version.py +++ b/schematic/version.py @@ -1,4 +1,3 @@ """Sets the version of the package""" -import importlib.metadata - -__version__ = importlib.metadata.version("schematic") +# Version hardcoded see https://sagebionetworks.jira.com/browse/SCHEMATIC-229 +__version__ = "24.12.1" diff --git a/schematic_api/api/openapi/api.yaml b/schematic_api/api/openapi/api.yaml index 8689a8ae2..7c32e8d90 100644 --- a/schematic_api/api/openapi/api.yaml +++ b/schematic_api/api/openapi/api.yaml @@ -692,8 +692,8 @@ paths: - Synapse Storage /storage/dataset/files: get: - summary: Get all files in a given dataset folder - description: Get all files in a given dataset folder + summary: Get all files (excluding manifest files) in a given dataset folder + description: Get all files (excluding manifest files) in a given dataset folder operationId: schematic_api.api.routes.get_files_storage_dataset security: - access_token: [] diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 97484c15c..ad1e25279 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -20,6 +20,7 @@ from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser from schematic.store.synapse import ManifestDownload, SynapseStorage +from schematic.utils.df_utils import read_csv from schematic.utils.general import create_temp_folder, entity_type_mapping from schematic.utils.schema_utils import ( DisplayLabelType, @@ -178,7 +179,7 @@ def parse_bool(str_bool): def return_as_json(manifest_local_file_path): - manifest_csv = pd.read_csv(manifest_local_file_path) + manifest_csv = read_csv(manifest_local_file_path) manifest_json = manifest_csv.to_dict(orient="records") return manifest_json diff --git a/tests/conftest.py b/tests/conftest.py index 2f9dd3047..d373e6ae6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from flask.testing import FlaskClient from opentelemetry import trace +from opentelemetry.instrumentation.flask import FlaskInstrumentor from synapseclient.client import Synapse from schematic.configuration.configuration import CONFIG, Configuration @@ -42,6 +43,8 @@ TESTS_DIR = os.path.dirname(os.path.abspath(__file__)) DATA_DIR = os.path.join(TESTS_DIR, "data") +FlaskInstrumentor().uninstrument() + @pytest.fixture(scope="session") def dataset_id(): diff --git a/tests/data/example.model.csv b/tests/data/example.model.csv index 7438c7145..1c2923535 100644 --- a/tests/data/example.model.csv +++ b/tests/data/example.model.csv @@ -10,7 +10,7 @@ Cancer 
Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,, Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,, Sample ID,,,,,TRUE,DataProperty,,, -Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,, +Tissue Status,,"Healthy, Malignant, None",,,TRUE,DataProperty,,, Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,, Filename,,,,,TRUE,DataProperty,,,#MockFilename filenameExists^^ File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,, @@ -24,8 +24,8 @@ Check List,,,,,TRUE,DataProperty,,,list Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list Check List Like,,,,,TRUE,DataProperty,,,list like Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like -Check List Strict,,,,,TRUE,DataProperty,,,list strict -Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict +Check List Strict,,,,,TRUE,DataProperty,,,list strict +Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f] Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f] Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f] diff --git a/tests/data/example.model.jsonld b/tests/data/example.model.jsonld index c4279a605..7e4560d50 100644 --- a/tests/data/example.model.jsonld +++ b/tests/data/example.model.jsonld @@ -540,6 +540,9 @@ }, { "@id": "bts:Malignant" + }, + { + "@id": "bts:None" } ], "sms:displayName": "Tissue Status", @@ -563,6 +566,23 @@ "sms:required": "sms:false", "sms:validationRules": [] }, + { + "@id": "bts:None", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "None", + "rdfs:subClassOf": [ + { + "@id": "bts:TissueStatus" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "None", + "sms:required": "sms:false", + "sms:validationRules": [] + }, { "@id": "bts:BulkRNA-seqAssay", "@type": "rdfs:Class", diff --git a/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv b/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv new file mode 100644 index 000000000..8230442f3 --- /dev/null +++ b/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv @@ -0,0 +1,6 @@ +Sample ID,Patient ID,Tissue Status,Component +1,1,Healthy,Biospecimen +2,2,Malignant,Biospecimen +3,3,None,Biospecimen +4,4,None,Biospecimen +5,5,InvalidValue,Biospecimen diff --git a/tests/data/mock_manifests/Valid_none_value_test_manifest.csv b/tests/data/mock_manifests/Valid_none_value_test_manifest.csv new file mode 100644 index 000000000..b7a0666d6 --- /dev/null +++ b/tests/data/mock_manifests/Valid_none_value_test_manifest.csv @@ -0,0 +1,6 @@ +Sample ID,Patient ID,Tissue Status,Component +1,1,Healthy,Biospecimen +2,2,Malignant,Biospecimen +3,3,None,Biospecimen +4,4,None,Biospecimen +5,5,None,Biospecimen \ No newline at end of file diff --git a/tests/integration/test_commands.py b/tests/integration/test_commands.py index 8658f6e6b..342736ab1 100644 --- a/tests/integration/test_commands.py +++ b/tests/integration/test_commands.py @@ -4,16 +4,16 @@ import uuid from io import BytesIO +import numpy as np import pytest import requests -from openpyxl import load_workbook from click.testing import CliRunner -import pandas as pd -import numpy as np +from openpyxl import load_workbook -from 
schematic.configuration.configuration import Configuration, CONFIG +from schematic.configuration.configuration import CONFIG, Configuration from schematic.manifest.commands import manifest from schematic.models.commands import model +from schematic.utils.df_utils import read_csv from tests.conftest import ConfigurationForTesting LIGHT_BLUE = "FFEAF7F9" # Required cell @@ -95,14 +95,14 @@ def test_validate_valid_manifest(self, runner: CliRunner) -> None: # command has no (python) errors, has exit code 0 assert result.exit_code == 0 # command output has success message - assert result.output.split("\n")[4] == ( + result_list = result.output.split("\n") + assert ( "Your manifest has been validated successfully. " "There are no errors in your manifest, " "and it can be submitted without any modifications." - ) + ) in result_list # command output has no validation errors - for line in result.output.split("\n")[4]: - assert not line.startswith("error") + errors = [errors for result in result_list if result.startswith("error")] def test_validate_invalid_manifest(self, runner: CliRunner) -> None: """ @@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None: # command has no (python) errors, has exit code 0 assert result.exit_code == 0 - biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv") - patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv") + biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv") + patient_df = read_csv("tests/data/example.Patient.manifest.csv") # Remove created files: finally: @@ -339,7 +339,7 @@ def test_generate_empty_google_sheet_manifests( assert False, f"Unexpected data validation found: {dv}" assert tissue_status_validation is not None assert tissue_status_validation.type == "list" - assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$3" + assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$4" # required fields are marked as “light blue”, while other non-required fields are marked as white. 
for col in ["Sample ID", "Patient ID", "Tissue Status", "Component"]: @@ -504,9 +504,10 @@ def test_generate_empty_excel_manifest( os.remove("tests/data/example.Biospecimen.schema.json") # command output has excel file creation message + result_list = result.output.split("\n") assert ( - result.output.split("\n")[7] - == "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx" + "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx" + in result_list ) sheet1 = workbook["Sheet1"] @@ -665,18 +666,19 @@ def test_generate_bulk_rna_google_sheet_manifest( # Reset config to it's default values CONFIG.load_config("config_example.yml") - assert result.output.split("\n")[7] == ( - "Find the manifest template using this Google Sheet URL:" - ) - assert result.output.split("\n")[8].startswith( - "https://docs.google.com/spreadsheets/d/" - ) - assert result.output.split("\n")[9] == ( + result_list = result.output.split("\n") + assert "Find the manifest template using this Google Sheet URL:" in result_list + assert ( "Find the manifest template using this CSV file path: " "./CLI_gs_bulk_rna.csv" - ) - - google_sheet_url = result.output.split("\n")[8] + ) in result_list + google_sheet_result = [ + result + for result in result_list + if result.startswith("https://docs.google.com/spreadsheets/d/") + ] + assert len(google_sheet_result) == 1 + google_sheet_url = google_sheet_result[0] # Download the Google Sheets content as an Excel file and load into openpyxl export_url = f"{google_sheet_url}/export?format=xlsx" @@ -908,18 +910,19 @@ def test_generate_bulk_rna_google_sheet_manifest_with_annotations( os.remove("tests/data/example.BulkRNA-seqAssay.schema.json") os.remove("./CLI_gs_bulk_rna_annos.csv") - assert result.output.split("\n")[10] == ( - "Find the manifest template using this Google Sheet URL:" - ) - assert result.output.split("\n")[11].startswith( - "https://docs.google.com/spreadsheets/d/" - ) - assert result.output.split("\n")[12] == ( + result_list = result.output.split("\n") + assert "Find the manifest template using this Google Sheet URL:" in result_list + assert ( "Find the manifest template using this CSV file path: " "./CLI_gs_bulk_rna_annos.csv" - ) - - google_sheet_url = result.output.split("\n")[11] + ) in result_list + google_sheet_result = [ + result + for result in result_list + if result.startswith("https://docs.google.com/spreadsheets/d/") + ] + assert len(google_sheet_result) == 1 + google_sheet_url = google_sheet_result[0] # Download the Google Sheets content as an Excel file and load into openpyxl export_url = f"{google_sheet_url}/export?format=xlsx" @@ -1177,10 +1180,11 @@ def test_generate_mock_component_excel_manifest(self, runner: CliRunner) -> None # TODO: remove with https://sagebionetworks.jira.com/browse/SCHEMATIC-202 # Reset config to it's default values CONFIG.load_config("config_example.yml") - # Command output has excel file message - assert result.output.split("\n")[8] == ( + result_list = result.output.split("\n") + assert ( "Find the manifest template using this Excel file path: ./CLI_mock_comp.xlsx" + in result_list ) sheet1 = workbook["Sheet1"] diff --git a/tests/integration/test_manifest_submission.py b/tests/integration/test_manifest_submission.py index 92e6911c1..84581f7d8 100644 --- a/tests/integration/test_manifest_submission.py +++ b/tests/integration/test_manifest_submission.py @@ -4,7 +4,6 @@ import uuid from typing import Any, Callable, Dict -import pandas as pd import pytest import requests from 
flask.testing import FlaskClient @@ -12,6 +11,7 @@ from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import read_csv from tests.conftest import ConfigurationForTesting, Helpers from tests.utils import CleanupItem @@ -73,7 +73,7 @@ def validate_submitted_manifest_file( manifest_file_path = os.path.join( download_location, manifest_data["properties"]["name"] ) - manifest_submitted_df = pd.read_csv(manifest_file_path) + manifest_submitted_df = read_csv(manifest_file_path) assert "entityId" in manifest_submitted_df.columns assert "Id" in manifest_submitted_df.columns @@ -1035,6 +1035,11 @@ def test_submit_manifest_with_hide_blanks( randomized_annotation_content = str(uuid.uuid4()) df["RandomizedAnnotation"] = randomized_annotation_content + # AND a "None" string remains in the manifest + df["NoneString"] = "None" + df["NoneString1"] = "none" + df["NoneString2"] = "NoNe" + with tempfile.NamedTemporaryFile(delete=True, suffix=".csv") as tmp_file: # Write the DF to a temporary file df.to_csv(tmp_file.name, index=False) @@ -1070,6 +1075,9 @@ def test_submit_manifest_with_hide_blanks( modified_file = syn.get(df["entityId"][0], downloadFile=False) assert modified_file is not None assert modified_file["RandomizedAnnotation"][0] == randomized_annotation_content + assert modified_file["NoneString"][0] == "None" + assert modified_file["NoneString1"][0] == "none" + assert modified_file["NoneString2"][0] == "NoNe" # AND the blank annotations are not present assert "Genome Build" not in modified_file diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 2178a83b8..21da9cfd8 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -1,5 +1,5 @@ """ -This script contains a test suite for verifying the submission and annotation of +This script contains a test suite for verifying the validation, submission and annotation of file-based manifests using the `TestMetadataModel` class to communicate with Synapse and verify the expected behavior of uploading annotation manifest CSVs using the metadata model. 
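
To make the intent of the "None"-string assertions above concrete, here is a small sketch (not part of the patch) of the behaviour the new `read_csv` wrapper from `schematic/utils/df_utils.py` in this changeset is expected to provide, using an in-memory CSV rather than a real manifest:

```python
# Illustrative only: contrasts pandas defaults with the read_csv wrapper added in this PR.
import io

import pandas as pd

from schematic.utils.df_utils import read_csv  # wrapper defined earlier in this diff

csv_text = "Sample ID,Tissue Status\n1,Healthy\n2,None\n"

# Default pandas behaviour: the literal string "None" is coerced to NaN.
default_df = pd.read_csv(io.StringIO(csv_text))
assert pd.isna(default_df["Tissue Status"][1])

# Wrapper behaviour: "None" is dropped from the na_values set, so it survives as a string.
wrapped_df = read_csv(io.StringIO(csv_text))
assert wrapped_df["Tissue Status"][1] == "None"
```
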
@@ -13,7 +13,7 @@ import tempfile import uuid from contextlib import nullcontext as does_not_raise -from typing import Callable, Optional +from typing import Any, Callable, Optional import pandas as pd import pytest @@ -23,6 +23,7 @@ from synapseclient.models import File, Folder from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import create_temp_folder from tests.conftest import Helpers, metadata_model from tests.utils import CleanupItem @@ -139,7 +140,6 @@ async def test_submit_filebased_manifest_file_and_entities_valid_manifest_submit dir=create_temp_folder(path=tempfile.gettempdir()), ) as tmp_file: df.to_csv(tmp_file.name, index=False) - # WHEN the manifest is submitted (Assertions are handled in the helper method) self._submit_and_verify_manifest( helpers=helpers, @@ -229,7 +229,6 @@ async def test_submit_filebased_manifest_file_and_entities_mock_filename( dir=create_temp_folder(path=tempfile.gettempdir()), ) as tmp_file: df.to_csv(tmp_file.name, index=False) - # WHEN the manifest is submitted (Assertions are handled in the helper method) self._submit_and_verify_manifest( helpers=helpers, @@ -450,6 +449,11 @@ def _submit_and_verify_manifest( raise ValueError( "expected_manifest_id or expected_manifest_name must be provided" ) + # HACK: must requery the fileview to get new files, since SynapseStorage will query the last state + # of the fileview which may not contain any new folders in the fileview. + # This is a workaround to fileviews not always containing the latest information + # Since the tests don't always follow a similar process as testing resources are created and destroyed + synapse_store.query_fileview(force_requery=True) # Spies if already_spied: @@ -531,7 +535,7 @@ def _submit_and_verify_manifest( ) manifest_table = synapse_store.syn.tableQuery( f"select * from {expected_table_id}", downloadLocation=download_dir - ).asDataFrame() + ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) # AND the columns in the manifest table should reflect the ones in the file table_columns = manifest_table.columns @@ -566,3 +570,46 @@ def _submit_and_verify_manifest( spy_upload_file_as_table.call_count == 1 spy_upload_file_as_csv.assert_not_called() spy_upload_file_combo.assert_not_called() + + def test_validate_model_manifest_valid_with_none_string( + self, helpers: Helpers + ) -> None: + """ + Tests for validateModelManifest when the manifest is valid with 'None' values + + Args: + helpers: Test helper functions + """ + meta_data_model = metadata_model(helpers, "class_label") + + errors, warnings = meta_data_model.validateModelManifest( + manifestPath="tests/data/mock_manifests/Valid_none_value_test_manifest.csv", + rootNode="Biospecimen", + ) + assert not warnings + assert not errors + + def test_validate_model_manifest_invalid(self, helpers: Helpers) -> None: + """ + Tests for validateModelManifest when the manifest requires values to be from a set of values containing 'None' + + Args: + helpers: Test helper functions + """ + meta_data_model = metadata_model(helpers, "class_label") + + errors, warnings = meta_data_model.validateModelManifest( + manifestPath="tests/data/mock_manifests/Invalid_none_value_test_manifest.csv", + rootNode="Biospecimen", + ) + assert not warnings + assert errors[0][0] == "6" + assert errors[0][1] == "Tissue Status" + assert errors[0][3] == "InvalidValue" + # The order of the valid values in the error message are random, so the test must be + # 
slightly complicated: + # 'InvalidValue' is not one of ['Malignant', 'Healthy', 'None'] + # 'InvalidValue' is not one of ['Healthy', 'Malignant', 'None'] + error_message = errors[0][2] + assert isinstance(error_message, str) + assert error_message.startswith("'InvalidValue' is not one of") diff --git a/tests/integration/test_store_synapse.py b/tests/integration/test_store_synapse.py index 085a48fc3..5c39acc3c 100644 --- a/tests/integration/test_store_synapse.py +++ b/tests/integration/test_store_synapse.py @@ -1,10 +1,17 @@ -from unittest.mock import MagicMock +from re import compile as re_compile +from unittest.mock import MagicMock, create_autospec, patch import numpy as np +import pandas as pd import pytest +from synapseclient import Synapse +from synapseclient.core.cache import Cache +from synapseclient.table import build_table +from schematic.configuration.configuration import Configuration from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.store.synapse import SynapseStorage +from schematic.utils.general import SYN_ID_REGEX from schematic.utils.validate_utils import comma_separated_list_regex @@ -125,3 +132,327 @@ def test_process_row_annotations_get_validation( "value2", "value3", ] + + @pytest.mark.parametrize( + "asset_view, dataset_id, expected_files", + [ + ( + "syn23643253", + "syn61374924", + [ + ("syn61374926", "schematic - main/BulkRNASeq and files/txt1.txt"), + ("syn61374930", "schematic - main/BulkRNASeq and files/txt2.txt"), + ("syn62282794", "schematic - main/BulkRNASeq and files/txt3.txt"), + ("syn62282720", "schematic - main/BulkRNASeq and files/txt4.txt"), + ], + ), + ( + "syn23643253", + "syn25614635", + [ + ( + "syn25614636", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt", + ), + ( + "syn25614637", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt", + ), + ( + "syn25614638", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt", + ), + ], + ), + ( + "syn63917487", + "syn63917494", + [ + ( + "syn63917520", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt1.txt", + ), + ( + "syn63917521", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt2.txt", + ), + ( + "syn63917522", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt3.txt", + ), + ( + "syn63917518", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt4.txt", + ), + ], + ), + ( + "syn23643253", + "syn63927665", + [ + ( + "syn63927671", + "schematic - main/BulkRNAseq nested files/data/txt1.txt", + ), + ( + "syn63927672", + "schematic - main/BulkRNAseq nested files/data/txt2.txt", + ), + ( + "syn63927673", + "schematic - main/BulkRNAseq nested files/data/txt3.txt", + ), + ( + "syn63927670", + "schematic - main/BulkRNAseq nested files/data/txt4.txt", + ), + ], + ), + ( + "syn23643253", + "syn63987067", + [ + ( + "syn63987072", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt1.txt", + ), + ( + "syn63987073", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt2.txt", + ), + ( + "syn63987074", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt3.txt", + ), + ( + "syn63987071", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt4.txt", + ), + ], + ), + ], + ids=[ + "top level folder", + "nested dataset", + "contentType:dataset annotation", + "nested data 
files", + "doubly nested data files", + ], + ) + @pytest.mark.parametrize( + "filenames", + [None, ["txt1.txt", "txt2.txt"]], + ids=["no file filtering", "file filtering"], + ) + def test_get_files_in_storage_dataset( + self, filenames, asset_view, dataset_id, expected_files, synapse_store + ): + # GIVEN a SynapseStorage object with the appropriate asset view + synapse_store.storageFileview = asset_view + # WHEN getFilesInStorageDataset is called for the given dataset + dataset_files = synapse_store.getFilesInStorageDataset( + datasetId=dataset_id, fileNames=filenames + ) + # AND the filenames are filtered as appropriate + if filenames: + files_to_remove = [] + for f in expected_files: + retain = False + for name in filenames: + if name in f[1]: + retain = True + if not retain: + files_to_remove.append(f) + + for file in files_to_remove: + expected_files.remove(file) + + # THEN the expected files are returned + # AND there are no unexpected files + assert dataset_files == expected_files + # AND the (synId, path) order is correct + synapse_id_regex = re_compile(SYN_ID_REGEX) + if dataset_files: + assert synapse_id_regex.fullmatch(dataset_files[0][0]) + + @pytest.mark.parametrize( + "asset_view, dataset_id, exception, exception_message", + [ + ( + "syn23643253", + "syn64319379", + LookupError, + "Dataset syn64319379 could not be found", + ), + ("syn64367119", "syn61374924", ValueError, "Fileview syn64367119 is empty"), + ], + ids=[ + "empty dataset", + "empty view", + ], + ) + def test_get_files_in_storage_dataset_exception( + self, asset_view, dataset_id, exception, exception_message, synapse_store + ): + # GIVEN a SynapseStorage object with the appropriate asset view + synapse_store.storageFileview = asset_view + # AND the correct and up-to-date fileview + synapse_store.query_fileview(force_requery=True) + # WHEN getFilesInStorageDataset is called + # THEN the appropriate exception should be raised, with the appropriate message + with pytest.raises(exception, match=exception_message): + synapse_store.getFilesInStorageDataset(datasetId=dataset_id, fileNames=None) + + @pytest.mark.parametrize( + "asset_view, dataset_id, expected_files", + [ + ( + "syn23643253", + "syn61374924", + [ + ("syn61374926", "schematic - main/BulkRNASeq and files/txt1.txt"), + ("syn61374930", "schematic - main/BulkRNASeq and files/txt2.txt"), + ("syn62282794", "schematic - main/BulkRNASeq and files/txt3.txt"), + ("syn62282720", "schematic - main/BulkRNASeq and files/txt4.txt"), + ], + ), + ( + "syn23643253", + "syn25614635", + [ + ( + "syn25614636", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt", + ), + ( + "syn25614637", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt", + ), + ( + "syn25614638", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt", + ), + ], + ), + ( + "syn63917487", + "syn63917494", + [ + ( + "syn63917520", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt1.txt", + ), + ( + "syn63917521", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt2.txt", + ), + ( + "syn63917522", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt3.txt", + ), + ( + "syn63917518", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt4.txt", + ), + ], + ), + ( + "syn23643253", + "syn63927665", + [ + ( + "syn63927671", + "schematic - main/BulkRNAseq nested files/data/txt1.txt", + ), + ( + "syn63927672", + 
"schematic - main/BulkRNAseq nested files/data/txt2.txt", + ), + ( + "syn63927673", + "schematic - main/BulkRNAseq nested files/data/txt3.txt", + ), + ( + "syn63927670", + "schematic - main/BulkRNAseq nested files/data/txt4.txt", + ), + ], + ), + ( + "syn23643253", + "syn63987067", + [ + ( + "syn63987072", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt1.txt", + ), + ( + "syn63987073", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt2.txt", + ), + ( + "syn63987074", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt3.txt", + ), + ( + "syn63987071", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt4.txt", + ), + ], + ), + ], + ids=[ + "top level folder", + "nested dataset", + "contentType:dataset annotation", + "nested data files", + "doubly nested data files", + ], + ) + @pytest.mark.parametrize( + "filenames", + [None, ["txt1.txt", "txt2.txt"]], + ids=["no file filtering", "file filtering"], + ) + def test_mock_get_files_in_storage_dataset( + self, + synapse_store, + filenames, + asset_view, + dataset_id, + expected_files, + ): + # GIVEN a test configuration + TEST_CONFIG = Configuration() + + with patch( + "schematic.store.synapse.CONFIG", return_value=TEST_CONFIG + ) as mock_config: + # AND the appropriate asset view + mock_config.synapse_master_fileview_id = asset_view + # AND the appropriately filtered filenames + if filenames: + files_to_remove = [] + for f in expected_files: + retain = False + for name in filenames: + if name in f[1]: + retain = True + if not retain: + files_to_remove.append(f) + + for file in files_to_remove: + expected_files.remove(file) + + # WHEN getFilesInStorageDataset is called for the given dataset + dataset_files = synapse_store.getFilesInStorageDataset( + datasetId=dataset_id, fileNames=filenames + ) + + # THEN the expected files are returned + # AND there are no unexpected files + assert dataset_files == expected_files + # AND the (synId, path) order is correct + synapse_id_regex = re_compile(SYN_ID_REGEX) + if dataset_files: + assert synapse_id_regex.fullmatch(dataset_files[0][0]) diff --git a/tests/test_api.py b/tests/test_api.py index 1f2d79add..842210d5b 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -14,11 +14,11 @@ from flask.testing import FlaskClient from opentelemetry import trace -from schematic.configuration.configuration import Configuration +from schematic.configuration.configuration import CONFIG, Configuration from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser +from schematic.utils.df_utils import read_csv from schematic.utils.general import create_temp_folder -from schematic.configuration.configuration import CONFIG logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -838,7 +838,7 @@ def test_generate_manifest_file_based_annotations( response_google_sheet = json.loads(response.data) # open the google sheet - google_sheet_df = pd.read_csv( + google_sheet_df = read_csv( response_google_sheet[0] + "/export?gid=0&format=csv" ) @@ -894,7 +894,7 @@ def test_generate_manifest_not_file_based_with_annotations( response_google_sheet = json.loads(response.data) # open the google sheet - google_sheet_df = pd.read_csv( + google_sheet_df = read_csv( response_google_sheet[0] + "/export?gid=0&format=csv" ) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index ade80fbe9..2358019d9 
100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from pandas.testing import assert_series_equal from schematic.configuration.configuration import Configuration from schematic.manifest.generator import ManifestGenerator @@ -103,6 +104,9 @@ def mock_create_blank_google_sheet(): @pytest.fixture(params=[True, False], ids=["sheet_url", "data_frame"]) def manifest(dataset_id, manifest_generator, request): + """ + Only seems to be used for TestManifestGenerator.test_get_manifest_first_time + """ # Rename request param for readability sheet_url = request.param @@ -763,13 +767,11 @@ def test_create_manifests( assert all_results == expected_result @pytest.mark.parametrize( - "component,datasetId,expected_file_based,expected_rows,expected_files", + "component, datasetId, expected_rows, expected_files, expected_annotations", [ - ("Biospecimen", "syn61260107", False, 4, None), ( "BulkRNA-seqAssay", "syn61374924", - True, 4, pd.Series( [ @@ -780,18 +782,50 @@ def test_create_manifests( ], name="Filename", ), + { + "File Format": pd.Series( + ["BAM", "CRAM", "CSV/TSV", ""], name="File Format" + ), + "Genome Build": pd.Series( + ["GRCh37", "GRCh38", "GRCm38", ""], name="Genome Build" + ), + }, + ), + ( + "BulkRNA-seqAssay", + "syn25614635", + 3, + pd.Series( + [ + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt", + ], + name="Filename", + ), + { + "File Format": pd.Series( + ["txt", "csv", "fastq"], name="File Format" + ), + "Year of Birth": pd.Series(["1980", "", ""], name="Year of Birth"), + }, ), ], - ids=["Record based", "File based"], + ids=[ + "top level folder", + "nested dataset", + ], ) + @pytest.mark.parametrize("use_annotations", [True, False]) def test_get_manifest_with_files( self, helpers, component, datasetId, - expected_file_based, expected_rows, expected_files, + expected_annotations, + use_annotations, ): """ Test to ensure that @@ -813,27 +847,37 @@ def test_get_manifest_with_files( path_to_data_model=path_to_data_model, graph=graph_data_model, root=component, - use_annotations=True, + use_annotations=use_annotations, ) - # WHEN a manifest is generated for the appropriate dataset as a dataframe + # WHEN a filebased manifest is generated for the appropriate dataset as a dataframe manifest = generator.get_manifest( dataset_id=datasetId, output_format="dataframe" ) - # AND it is determined if the manifest is filebased - is_file_based = "Filename" in manifest.columns - # AND the number of rows are checked n_rows = manifest.shape[0] # THEN the manifest should have the expected number of rows assert n_rows == expected_rows - # AND the manifest should be filebased or not as expected - assert is_file_based == expected_file_based + # AND the manifest should have the columns expected of filebased metadata + assert "Filename" in manifest.columns + assert "entityId" in manifest.columns + + # AND the manifest should have the expected files from the dataset + assert_series_equal( + manifest["Filename"], + expected_files, + check_dtype=False, + ) - # AND if the manifest is file based - if expected_file_based: - # THEN the manifest should have the expected files - assert manifest["Filename"].equals(expected_files) + # AND if annotations are used to generate the manifest + if use_annotations: + # THEN the annotations in the generated manifest should match the expected 
annotations + for attribute, annotations in expected_annotations.items(): + assert_series_equal( + manifest[attribute], + annotations, + check_dtype=False, + ) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 409c653b0..670aa2474 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -462,7 +462,7 @@ def test_generate_data_model_graph(self, helpers, data_model, data_model_labels) ) # Check Edge directions - assert 4 == (len(graph.out_edges("TissueStatus"))) + assert 6 == (len(graph.out_edges("TissueStatus"))) assert 2 == (len(graph.in_edges("TissueStatus"))) diff --git a/tests/test_store.py b/tests/test_store.py index c9f32bec2..49b79a3c0 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -11,7 +11,7 @@ import uuid from contextlib import nullcontext as does_not_raise from typing import Any, Callable, Generator -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock, MagicMock, patch import pandas as pd import pytest @@ -22,12 +22,14 @@ from synapseclient.entity import File, Project from synapseclient.models import Annotations from synapseclient.models import Folder as FolderModel +from synapseclient.table import build_table from schematic.configuration.configuration import CONFIG, Configuration from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser from schematic.store.base import BaseStorage from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage +from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import check_synapse_cache_size, create_temp_folder from tests.conftest import Helpers from tests.utils import CleanupItem @@ -463,46 +465,103 @@ def test_getDatasetProject(self, dataset_id, synapse_store): ( True, [ - ("syn126", "schematic - main/parent_folder/test_file"), + ("syn126", "syn_mock", "schematic - main/parent_folder/test_file"), ( "syn125", + "syn_mock", "schematic - main/parent_folder/test_folder/test_file_2", ), ], ), - (False, [("syn126", "test_file"), ("syn125", "test_file_2")]), + ( + False, + [ + ("syn126", "syn_mock", "test_file"), + ("syn125", "syn_mock", "test_file_2"), + ], + ), ], ) def test_getFilesInStorageDataset(self, synapse_store, full_path, expected): - mock_return = [ + mock_table_dataframe_return = pd.DataFrame( + { + "id": ["syn126", "syn125"], + "parentId": ["syn_mock", "syn_mock"], + "path": [ + "schematic - main/parent_folder/test_file", + "schematic - main/parent_folder/test_folder/test_file_2", + ], + } + ) + + with patch.object( + synapse_store, "storageFileviewTable", mock_table_dataframe_return + ), patch.object(synapse_store, "query_fileview") as mocked_query: + # query_fileview is the function called to get the fileview + mocked_query.return_value = mock_table_dataframe_return + + file_list = synapse_store.getFilesInStorageDataset( + datasetId="syn_mock", fileNames=None, fullpath=full_path + ) + assert file_list == expected + + @pytest.mark.parametrize( + "mock_table_dataframe, raised_exception, exception_message", + [ ( - ("parent_folder", "syn123"), - [("test_folder", "syn124")], - [("test_file", "syn126")], + pd.DataFrame( + { + "id": ["child_syn_mock"], + "path": ["schematic - main/parent_folder/child_entity"], + "parentId": ["wrong_syn_mock"], + } + ), + LookupError, + "Dataset syn_mock could not be found", ), ( - ( - os.path.join("schematic - main", "parent_folder", "test_folder"), - "syn124", + pd.DataFrame( + { + "id": 
[], + "path": [], + "parentId": [], + } ), - [], - [("test_file_2", "syn125")], + ValueError, + "Fileview mock_table_id is empty", ), - ] - with patch( - "synapseutils.walk_functions._help_walk", return_value=mock_return - ) as mock_walk_patch, patch( - "schematic.store.synapse.SynapseStorage.getDatasetProject", - return_value="syn23643250", - ) as mock_project_id_patch, patch( - "synapseclient.entity.Entity.__getattr__", return_value="schematic - main" - ) as mock_project_name_patch, patch.object( - synapse_store.syn, "get", return_value=Project(name="schematic - main") - ): - file_list = synapse_store.getFilesInStorageDataset( - datasetId="syn_mock", fileNames=None, fullpath=full_path - ) - assert file_list == expected + ], + ids=["missing_dataset", "empty_fileview"], + ) + @pytest.mark.parametrize( + "full_path,", + [ + (True), + (False), + ], + ) + def test_get_files_in_storage_dataset_exception( + self, + synapse_store, + mock_table_dataframe, + raised_exception, + exception_message, + full_path, + ): + with patch.object( + synapse_store, "storageFileviewTable", mock_table_dataframe + ), patch.object( + synapse_store, "storageFileview", "mock_table_id" + ) as mock_table_id, patch.object( + synapse_store, "query_fileview" + ) as mocked_query: + # query_fileview is the function called to get the fileview + mocked_query.return_value = mock_table_dataframe + + with pytest.raises(raised_exception, match=exception_message): + synapse_store.getFilesInStorageDataset( + datasetId="syn_mock", fileNames=None, fullpath=full_path + ) @pytest.mark.parametrize("downloadFile", [True, False]) def test_getDatasetManifest(self, synapse_store, downloadFile): @@ -926,6 +985,26 @@ async def mock_success_coro() -> None: await synapse_store._process_store_annos(new_tasks) mock_store_async.assert_not_called() + @pytest.mark.parametrize( + "dataset_id, dataset_folder_list, expected_clause", + [ + ("syn12345678", None, "parentId='syn12345678'"), + ( + "syn63927665", + ["syn63927665", "syn63927667"], + "parentId IN ('syn63927665', 'syn63927667')", + ), + (None, None, ""), + ], + ) + def test_build_clause_from_dataset_id( + self, dataset_id, dataset_folder_list, expected_clause + ): + dataset_clause = SynapseStorage.build_clause_from_dataset_id( + dataset_id=dataset_id, dataset_folder_list=dataset_folder_list + ) + assert dataset_clause == expected_clause + class TestDatasetFileView: def test_init(self, dataset_id, dataset_fileview, synapse_store): @@ -1244,7 +1323,7 @@ async def copy_folder_and_update_manifest( table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId) days_to_follow_up = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1281,7 +1360,7 @@ async def copy_folder_and_update_manifest( table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId) days_to_follow_up = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1343,7 +1422,7 @@ async def test_upsert_table( # Query table for DaystoFollowUp column table_query = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1384,7 +1463,7 @@ async def test_upsert_table( table_id = synapse_store.syn.findEntityId(name=table_name, 
parent=projectId) table_query = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2a0744439..4a8d30cd3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,19 +2,11 @@ import json import logging import os -import tempfile -import time -from datetime import datetime -from pathlib import Path -from typing import Generator, Union import numpy as np import pandas as pd import pytest -import synapseclient.core.cache as cache -from _pytest.fixtures import FixtureRequest from pandas.testing import assert_frame_equal -from synapseclient.core.exceptions import SynapseHTTPError from schematic.models.metadata import MetadataModel from schematic.models.validate_manifest import ValidateManifest @@ -26,14 +18,8 @@ convert_graph_to_jsonld, ) from schematic.schemas.data_model_parser import DataModelParser -from schematic.utils import cli_utils, df_utils, general, io_utils, validate_utils -from schematic.utils.df_utils import load_df -from schematic.utils.general import ( - calculate_datetime, - check_synapse_cache_size, - clear_synapse_cache, - entity_type_mapping, -) +from schematic.utils import cli_utils, df_utils, io_utils, validate_utils +from schematic.utils.df_utils import load_df, read_csv from schematic.utils.schema_utils import ( check_for_duplicate_components, check_if_display_name_is_valid_label, @@ -168,13 +154,6 @@ DATA_MODEL_DICT = {"example.model.csv": "CSV", "example.model.jsonld": "JSONLD"} -test_disk_storage = [ - (2, 4000, 16000), - (1000, 4000, 16000), - (2000000, 1900000, 2000000), - (1073741825, 1073741824, 1181116006.4), -] - def get_metadataModel(helpers, model_name: str): metadataModel = MetadataModel( @@ -185,164 +164,6 @@ def get_metadataModel(helpers, model_name: str): return metadataModel -# create temporary files with various size based on request -@pytest.fixture() -def create_temp_query_file( - tmp_path: Path, request: FixtureRequest -) -> Generator[tuple[Path, Path, Path], None, None]: - """create temporary files of various size based on request parameter. 
- - Args: - tmp_path (Path): temporary file path - request (any): a request for a fixture from a test - - Yields: - Generator[Tuple[Path, Path, Path]]: return path of mock synapse cache directory, mock table query folder and csv - """ - # define location of mock synapse cache - mock_synapse_cache_dir = tmp_path / ".synapseCache/" - mock_synapse_cache_dir.mkdir() - mock_sub_folder = mock_synapse_cache_dir / "123" - mock_sub_folder.mkdir() - mock_table_query_folder = mock_sub_folder / "456" - mock_table_query_folder.mkdir() - - # create mock table query csv - mock_synapse_table_query_csv = ( - mock_table_query_folder / "mock_synapse_table_query.csv" - ) - with open(mock_synapse_table_query_csv, "wb") as f: - f.write(b"\0" * request.param) - yield mock_synapse_cache_dir, mock_table_query_folder, mock_synapse_table_query_csv - - -class TestGeneral: - @pytest.mark.parametrize("create_temp_query_file", [3, 1000], indirect=True) - def test_clear_synapse_cache(self, create_temp_query_file) -> None: - # define location of mock synapse cache - ( - mock_synapse_cache_dir, - mock_table_query_folder, - mock_synapse_table_query_csv, - ) = create_temp_query_file - # create a mock cache map - mock_cache_map = mock_table_query_folder / ".cacheMap" - mock_cache_map.write_text( - f"{mock_synapse_table_query_csv}: '2022-06-13T19:24:27.000Z'" - ) - - assert os.path.exists(mock_synapse_table_query_csv) - - # since synapse python client would compare last modified date and before date - # we have to create a little time gap here - time.sleep(1) - - # clear cache - my_cache = cache.Cache(cache_root_dir=mock_synapse_cache_dir) - clear_synapse_cache(my_cache, minutes=0.0001) - # make sure that cache files are now gone - assert os.path.exists(mock_synapse_table_query_csv) == False - assert os.path.exists(mock_cache_map) == False - - def test_calculate_datetime_before_minutes(self): - input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") - minutes_before = calculate_datetime( - input_date=input_date, minutes=10, before_or_after="before" - ) - expected_result_date_before = datetime.strptime( - "07/20/23 17:26:34", "%m/%d/%y %H:%M:%S" - ) - assert minutes_before == expected_result_date_before - - def test_calculate_datetime_after_minutes(self): - input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") - minutes_after = calculate_datetime( - input_date=input_date, minutes=10, before_or_after="after" - ) - expected_result_date_after = datetime.strptime( - "07/20/23 17:46:34", "%m/%d/%y %H:%M:%S" - ) - assert minutes_after == expected_result_date_after - - def test_calculate_datetime_raise_error(self): - with pytest.raises(ValueError): - input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") - minutes = calculate_datetime( - input_date=input_date, minutes=10, before_or_after="error" - ) - - # this test might fail for windows machine - @pytest.mark.not_windows - @pytest.mark.parametrize( - "create_temp_query_file,local_disk_size,gh_disk_size", - test_disk_storage, - indirect=["create_temp_query_file"], - ) - def test_check_synapse_cache_size( - self, - create_temp_query_file, - local_disk_size: int, - gh_disk_size: Union[int, float], - ) -> None: - mock_synapse_cache_dir, _, _ = create_temp_query_file - disk_size = check_synapse_cache_size(mock_synapse_cache_dir) - - # For some reasons, when running in github action, the size of file changes. 
- if IN_GITHUB_ACTIONS: - assert disk_size == gh_disk_size - else: - assert disk_size == local_disk_size - - def test_find_duplicates(self): - mock_list = ["foo", "bar", "foo"] - mock_dups = {"foo"} - - test_dups = general.find_duplicates(mock_list) - assert test_dups == mock_dups - - def test_dict2list_with_dict(self): - mock_dict = {"foo": "bar"} - mock_list = [{"foo": "bar"}] - - test_list = general.dict2list(mock_dict) - assert test_list == mock_list - - def test_dict2list_with_list(self): - # mock_dict = {'foo': 'bar'} - mock_list = [{"foo": "bar"}] - - test_list = general.dict2list(mock_list) - assert test_list == mock_list - - @pytest.mark.parametrize( - "entity_id,expected_type", - [ - ("syn27600053", "folder"), - ("syn29862078", "file"), - ("syn23643253", "asset view"), - ("syn30988314", "folder"), - ("syn51182432", "org.sagebionetworks.repo.model.table.TableEntity"), - ], - ) - def test_entity_type_mapping(self, synapse_store, entity_id, expected_type): - syn = synapse_store.syn - - entity_type = entity_type_mapping(syn, entity_id) - assert entity_type == expected_type - - def test_entity_type_mapping_invalid_entity_id(self, synapse_store): - syn = synapse_store.syn - - # test with an invalid entity id - with pytest.raises(SynapseHTTPError) as exception_info: - entity_type_mapping(syn, "syn123456") - - def test_download_manifest_to_temp_folder(self): - with tempfile.TemporaryDirectory() as tmpdir: - path_dir = general.create_temp_folder(tmpdir) - assert os.path.exists(path_dir) - - class TestCliUtils: def test_query_dict(self): mock_dict = {"k1": {"k2": {"k3": "foobar"}}} @@ -473,7 +294,7 @@ def test_load_df(self, helpers, preserve_raw_input): test_col = "Check NA" file_path = helpers.get_data_path("mock_manifests", "Invalid_Test_Manifest.csv") - unprocessed_df = pd.read_csv(file_path, encoding="utf8") + unprocessed_df = read_csv(file_path, encoding="utf8") df = df_utils.load_df( file_path, preserve_raw_input=preserve_raw_input, data_model=False ) @@ -1086,6 +907,11 @@ def test_validate_property_schema(self, helpers): "example.model.csv", "Patient", ), + ( + "mock_manifests/Valid_none_value_test_manifest.csv", + "example.model.csv", + "Biospecimen", + ), ( "mock_manifests/Valid_Test_Manifest_with_nones.csv", "example_test_nones.model.csv", @@ -1100,7 +926,7 @@ def test_convert_nan_entries_to_empty_strings( manifest_path = helpers.get_data_path(manifest) model_path = helpers.get_data_path(model) - ## Gather parmeters needed to run validate_manifest_rules + # Gather parmeters needed to run validate_manifest_rules errors = [] load_args = { "dtype": "string", @@ -1122,13 +948,13 @@ def test_convert_nan_entries_to_empty_strings( **load_args, ) - metadataModel = get_metadataModel(helpers, model) + get_metadataModel(helpers, model) # Instantiate Validate manifest, and run manifest validation # In this step the manifest is modified while running rule # validation so need to do this step to get the updated manfest. 
vm = ValidateManifest(errors, manifest, manifest_path, dmge, json_schema) - manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules( + manifest, _, _ = vm.validate_manifest_rules( manifest, dmge, restrict_rules=False, @@ -1143,6 +969,10 @@ def test_convert_nan_entries_to_empty_strings( if root_node == "Patient": assert manifest["Family History"][0] == [""] assert output["Family History"][0] == [""] + elif root_node == "Biospecimen": + assert output["Tissue Status"][2] == "None" + assert output["Tissue Status"][3] == "None" + assert output["Tissue Status"][4] == "None" elif root_node == "MockComponent": assert manifest["Check List"][2] == [""] assert manifest["Check List Like Enum"][2] == [] diff --git a/tests/test_validation.py b/tests/test_validation.py index c16a95bb7..646a2e543 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -54,11 +54,17 @@ class TestManifestValidation: "mock_manifests/Valid_Test_Manifest_with_nones.csv", "MockComponent", ), + ( + "example.model.csv", + "mock_manifests/Valid_none_value_test_manifest.csv", + "Biospecimen", + ), ], ids=[ "example_model", "example_with_no_entry_for_cond_required_columns", - "example_with_nones", + "example_with_nones_mock_component", + "example_with_nones_biospecimen", ], ) @pytest.mark.parametrize( diff --git a/tests/test_viz.py b/tests/test_viz.py index b94d79688..2ab78b9ce 100644 --- a/tests/test_viz.py +++ b/tests/test_viz.py @@ -1,11 +1,10 @@ import json import logging -import os from io import StringIO -import pandas as pd import pytest +from schematic.utils.df_utils import read_csv from schematic.visualization.attributes_explorer import AttributesExplorer from schematic.visualization.tangled_tree import TangledTree @@ -44,7 +43,7 @@ class TestVisualization: def test_ae(self, helpers, attributes_explorer): attributes_str = attributes_explorer.parse_attributes(save_file=False) - df = pd.read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"]) + df = read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"]) # For the attributes df define expected columns expect_col_names = [ @@ -76,7 +75,7 @@ def test_ce(self, component, attributes_explorer): component=component, save_file=False, include_index=False ) # convert to dataframe - component_attributes = pd.read_csv(StringIO(component_attributes_str)) + component_attributes = read_csv(StringIO(component_attributes_str)) # For the attributes df define expected columns expect_col_names = [ @@ -103,7 +102,7 @@ def test_text(self, helpers, tangled_tree): # Get text for tangled tree. 
text_str = tangled_tree.get_text_for_tangled_tree(text_format, save_file=False) - df = pd.read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"]) + df = read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"]) # Define expected text associated with 'Patient' and 'Imaging' tree expected_patient_text = ["Biospecimen", "BulkRNA-seqAssay"] diff --git a/tests/unit/test_df_utils.py b/tests/unit/test_df_utils.py new file mode 100644 index 000000000..28db591a2 --- /dev/null +++ b/tests/unit/test_df_utils.py @@ -0,0 +1,49 @@ +from io import BytesIO + +import numpy as np +import pandas as pd +from pandas._libs.parsers import STR_NA_VALUES + +from schematic.utils.df_utils import read_csv + + +class TestReadCsv: + def test_none_in_na_values(self) -> None: + # GIVEN a pandas DF that includes a column with a None value + df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]}) + + # AND None is included in the STR_NA_VALUES + if "None" not in STR_NA_VALUES: + STR_NA_VALUES.add("None") + + # AND its CSV representation + csv_buffer = BytesIO() + df.to_csv(csv_buffer, index=False) + csv_buffer.seek(0) + + # WHEN the CSV is read + result = read_csv(csv_buffer, na_values=STR_NA_VALUES) + + # THEN the None string value is not preserved + assert not result.equals(df) + assert result["col1"][0] == "AAA" + assert result["col1"][1] == "BBB" + assert result["col1"][2] is np.nan + + def test_none_not_in_na_values(self) -> None: + # GIVEN a pandas DF that includes a column with a None value + df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]}) + + # AND its CSV representation + csv_buffer = BytesIO() + df.to_csv(csv_buffer, index=False) + csv_buffer.seek(0) + + # WHEN the CSV is read + result = read_csv(csv_buffer) + + # THEN the None string value is preserved + assert result.equals(df) + assert result["col1"][0] == "AAA" + assert result["col1"][1] == "BBB" + assert result["col1"][2] == "None" diff --git a/tests/unit/test_store_synapse.py b/tests/unit/test_store_synapse.py new file mode 100644 index 000000000..c2b8d1403 --- /dev/null +++ b/tests/unit/test_store_synapse.py @@ -0,0 +1,59 @@ +import pandas as pd +import pytest + +from schematic.store.synapse import SynapseStorage + + +class TestStoreSynapse: + synapse_store = SynapseStorage() + test_manifest_good = pd.DataFrame( + {"Filename": ["test_file.txt"], "entityId": ["syn1"]} + ) + test_manifest_no_entityid = pd.DataFrame({"Filename": ["test_file.txt"]}) + test_dataset_files = [ + ("syn2", "test_file_1.txt"), + ("syn3", "test_file_2.txt"), + ] + + def test_get_file_entityIds_only_new_files_manifest_is_none(self) -> None: + with pytest.raises(UnboundLocalError, match="No manifest was passed in"): + self.synapse_store._get_file_entityIds( + dataset_files=[], only_new_files=True, manifest=None + ) + + def test_get_file_entityIds_only_new_files_manifest_no_entityId_column( + self, + ) -> None: + with pytest.raises(ValueError, match="The manifest in your dataset"): + self.synapse_store._get_file_entityIds( + dataset_files=[], + only_new_files=True, + manifest=self.test_manifest_no_entityid, + ) + + def test_get_file_entityIds_only_new_files_manifest_good(self) -> None: + assert self.synapse_store._get_file_entityIds( + dataset_files=self.test_dataset_files, + only_new_files=True, + manifest=self.test_manifest_good, + ) == { + "Filename": ["test_file_1.txt", "test_file_2.txt"], + "entityId": ["syn2", "syn3"], + } + + def test_get_file_entityIds_only_new_files_manifest_good_no_new_files(self) -> None: + assert self.synapse_store._get_file_entityIds( + 
dataset_files=[("syn1", "test_file.txt")], + only_new_files=True, + manifest=self.test_manifest_good, + ) == {"Filename": [], "entityId": []} + + def test_get_file_entityIds_all_files(self) -> None: + assert self.synapse_store._get_file_entityIds( + dataset_files=self.test_dataset_files, + only_new_files=False, + manifest=self.test_manifest_good, + ) == { + "Filename": ["test_file_1.txt", "test_file_2.txt"], + "entityId": ["syn2", "syn3"], + } diff --git a/tests/unit/test_utils_general.py b/tests/unit/test_utils_general.py new file mode 100644 index 000000000..ed8a7913f --- /dev/null +++ b/tests/unit/test_utils_general.py @@ -0,0 +1,236 @@ +import os +import tempfile +import time +from datetime import datetime +from pathlib import Path +from typing import Generator +from unittest import mock +from unittest.mock import MagicMock + +import pytest +import synapseclient.core.cache as cache +from _pytest.fixtures import FixtureRequest +from synapseclient.core.exceptions import SynapseHTTPError + +from schematic.utils import general +from schematic.utils.general import ( + calculate_datetime, + check_synapse_cache_size, + clear_synapse_cache, + entity_type_mapping, +) + +TEST_DISK_STORAGE = [ + (2, 2), + (1000, 1000), + (2000000, 2000000), + (1073741825, 1073741825), +] + + +# create temporary files with various size based on request +@pytest.fixture() +def create_temp_query_file( + tmp_path: Path, request: FixtureRequest +) -> Generator[tuple[Path, Path, Path], None, None]: + """create temporary files of various size based on request parameter. + + Args: + tmp_path (Path): temporary file path + request (any): a request for a fixture from a test + + Yields: + Generator[Tuple[Path, Path, Path]]: return path of mock synapse cache directory, mock table query folder and csv + """ + # define location of mock synapse cache + mock_synapse_cache_dir = tmp_path / ".synapseCache/" + mock_synapse_cache_dir.mkdir() + mock_sub_folder = mock_synapse_cache_dir / "123" + mock_sub_folder.mkdir() + mock_table_query_folder = mock_sub_folder / "456" + mock_table_query_folder.mkdir() + + # create mock table query csv + mock_synapse_table_query_csv = ( + mock_table_query_folder / "mock_synapse_table_query.csv" + ) + with open(mock_synapse_table_query_csv, "wb") as f: + f.write(b"\0" * request.param) + yield mock_synapse_cache_dir, mock_table_query_folder, mock_synapse_table_query_csv + + +@pytest.mark.parametrize( + "directory, file_sizes, expected_size", + [ + ("", [1024], 1024), # Default directory with 1 KB file + ("/custom/directory", [2048], 2048), # Custom directory with 2 KB file + ("", [], 0), # Empty directory + ("", [1024, 2048], 3072), # Directory with multiple files (1 KB + 2 KB) + ], +) +def test_check_synapse_cache_size(mocker, directory, file_sizes, expected_size): + """Test that the file sizes add up to the correct bytes""" + # Create a list of mocked files based on file_sizes + mock_files = [] + for size in file_sizes: + mock_file = MagicMock() + mock_file.is_file.return_value = True + mock_file.stat.return_value.st_size = size + mock_files.append(mock_file) + + # Mock Path().rglob() to return the mocked files + mock_rglob = mocker.patch( + "schematic.utils.general.Path.rglob", return_value=mock_files + ) + # Call the function with the directory parameter + result = check_synapse_cache_size(directory=directory) + + # Assert the result matches the expected size + assert result == expected_size + + +@pytest.mark.parametrize( + "directory", + [ + None, # Default directory + "/custom/directory", # Custom 
directory + ], +) +def test_check_synapse_cache_size_directory_call(directory): + """Test that the right directory is passed in""" + with mock.patch("schematic.utils.general.Path") as mock_path: + mock_rglob = MagicMock(return_value=[]) + mock_path.return_value.rglob = mock_rglob + + # Call the function with the directory parameter + if directory is None: + check_synapse_cache_size() + else: + check_synapse_cache_size(directory=directory) + + # Assert that Path was called with the correct directory + expected_directory = directory if directory else "/root/.synapseCache" + mock_path.assert_called_with(expected_directory) + + # Assert that rglob was called on the Path object + mock_rglob.assert_called_once_with("*") + + +class TestGeneral: + @pytest.mark.parametrize("create_temp_query_file", [3, 1000], indirect=True) + def test_clear_synapse_cache(self, create_temp_query_file) -> None: + # define location of mock synapse cache + ( + mock_synapse_cache_dir, + mock_table_query_folder, + mock_synapse_table_query_csv, + ) = create_temp_query_file + # create a mock cache map + mock_cache_map = mock_table_query_folder / ".cacheMap" + mock_cache_map.write_text( + f"{mock_synapse_table_query_csv}: '2022-06-13T19:24:27.000Z'" + ) + + assert os.path.exists(mock_synapse_table_query_csv) + + # since synapse python client would compare last modified date and before date + # we have to create a little time gap here + time.sleep(1) + + # clear cache + my_cache = cache.Cache(cache_root_dir=mock_synapse_cache_dir) + clear_synapse_cache(my_cache, minutes=0.0001) + # make sure that cache files are now gone + assert os.path.exists(mock_synapse_table_query_csv) == False + assert os.path.exists(mock_cache_map) == False + + def test_calculate_datetime_before_minutes(self): + input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") + minutes_before = calculate_datetime( + input_date=input_date, minutes=10, before_or_after="before" + ) + expected_result_date_before = datetime.strptime( + "07/20/23 17:26:34", "%m/%d/%y %H:%M:%S" + ) + assert minutes_before == expected_result_date_before + + def test_calculate_datetime_after_minutes(self): + input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") + minutes_after = calculate_datetime( + input_date=input_date, minutes=10, before_or_after="after" + ) + expected_result_date_after = datetime.strptime( + "07/20/23 17:46:34", "%m/%d/%y %H:%M:%S" + ) + assert minutes_after == expected_result_date_after + + def test_calculate_datetime_raise_error(self): + with pytest.raises(ValueError): + input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") + minutes = calculate_datetime( + input_date=input_date, minutes=10, before_or_after="error" + ) + + # this test might fail for windows machine + @pytest.mark.parametrize( + "create_temp_query_file,local_disk_size", + TEST_DISK_STORAGE, + indirect=["create_temp_query_file"], + ) + def test_check_synapse_cache_size( + self, + create_temp_query_file, + local_disk_size: int, + ) -> None: + mock_synapse_cache_dir, _, _ = create_temp_query_file + disk_size = check_synapse_cache_size(mock_synapse_cache_dir) + assert disk_size == local_disk_size + + def test_find_duplicates(self): + mock_list = ["foo", "bar", "foo"] + mock_dups = {"foo"} + + test_dups = general.find_duplicates(mock_list) + assert test_dups == mock_dups + + def test_dict2list_with_dict(self): + mock_dict = {"foo": "bar"} + mock_list = [{"foo": "bar"}] + + test_list = general.dict2list(mock_dict) + assert test_list == mock_list + 
+
+    def test_dict2list_with_list(self):
+        # mock_dict = {'foo': 'bar'}
+        mock_list = [{"foo": "bar"}]
+
+        test_list = general.dict2list(mock_list)
+        assert test_list == mock_list
+
+    @pytest.mark.parametrize(
+        "entity_id,expected_type",
+        [
+            ("syn27600053", "folder"),
+            ("syn29862078", "file"),
+            ("syn23643253", "asset view"),
+            ("syn30988314", "folder"),
+            ("syn51182432", "org.sagebionetworks.repo.model.table.TableEntity"),
+        ],
+    )
+    def test_entity_type_mapping(self, synapse_store, entity_id, expected_type):
+        syn = synapse_store.syn
+
+        entity_type = entity_type_mapping(syn, entity_id)
+        assert entity_type == expected_type
+
+    def test_entity_type_mapping_invalid_entity_id(self, synapse_store):
+        syn = synapse_store.syn
+
+        # test with an invalid entity id
+        with pytest.raises(SynapseHTTPError) as exception_info:
+            entity_type_mapping(syn, "syn123456")
+
+    def test_download_manifest_to_temp_folder(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path_dir = general.create_temp_folder(tmpdir)
+            assert os.path.exists(path_dir)
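Note on the `read_csv` helper exercised above: the patch swaps `pd.read_csv` for `schematic.utils.df_utils.read_csv` throughout the tests and adds `tests/unit/test_df_utils.py`, but the helper's implementation is not part of this diff. The sketch below is only an illustration of behaviour consistent with the new `TestReadCsv` expectations, assuming the helper forwards to `pandas.read_csv` with a reduced set of default NA sentinels; the exact signature and the contents of `STR_NA_VALUES_FILTERED` are assumptions, not taken from the repository.

```python
from io import StringIO
from typing import Any

import pandas as pd
from pandas._libs.parsers import STR_NA_VALUES

# Hypothetical: drop "None" from pandas' default NA sentinels so the literal
# string survives parsing instead of being coerced to NaN.
STR_NA_VALUES_FILTERED = set(STR_NA_VALUES) - {"None"}


def read_csv(path_or_buffer: Any, **kwargs: Any) -> pd.DataFrame:
    """Read a CSV, keeping "None" as a string unless the caller overrides na_values."""
    kwargs.setdefault("na_values", STR_NA_VALUES_FILTERED)
    kwargs.setdefault("keep_default_na", False)
    return pd.read_csv(path_or_buffer, **kwargs)


if __name__ == "__main__":
    # "None" is preserved as a string, mirroring TestReadCsv.test_none_not_in_na_values
    frame = read_csv(StringIO("col1\nAAA\nBBB\nNone\n"))
    assert frame["col1"].tolist() == ["AAA", "BBB", "None"]

    # Passing the full sentinel set back in coerces "None" to NaN,
    # mirroring TestReadCsv.test_none_in_na_values
    frame = read_csv(StringIO("col1\nAAA\nBBB\nNone\n"), na_values=STR_NA_VALUES | {"None"})
    assert frame["col1"].isna().tolist() == [False, False, True]
```

With defaults along these lines, the literal string "None" round-trips through a manifest CSV instead of collapsing to NaN, which is the behaviour the new Biospecimen none-value test cases and the `Valid_none_value_test_manifest.csv` fixture rely on.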