diff --git a/.github/workflows/api_test.yml b/.github/workflows/api_test.yml index 0dea21710..3e48bd09a 100644 --- a/.github/workflows/api_test.yml +++ b/.github/workflows/api_test.yml @@ -13,7 +13,7 @@ on: jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: POETRY_VERSION: 1.3.0 strategy: diff --git a/.github/workflows/pdoc.yml b/.github/workflows/pdoc.yml index 818bd3a61..d24ef8652 100644 --- a/.github/workflows/pdoc.yml +++ b/.github/workflows/pdoc.yml @@ -25,7 +25,7 @@ concurrency: cancel-in-progress: true jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: POETRY_VERSION: 1.3.0 PYTHON_VERSION: "3.10" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 5b85c995f..8f2e686ea 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -7,9 +7,10 @@ on: jobs: pypi_release: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: POETRY_VERSION: 1.3.0 + PYTHON_VERSION: "3.10" if: github.event_name == 'push' && contains(github.ref, 'refs/tags') steps: #---------------------------------------------- @@ -18,10 +19,10 @@ jobs: - name: Check out repository uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ env.PYTHON_VERSION }} #---------------------------------------------- # install & configure poetry @@ -48,7 +49,7 @@ jobs: - name: Get current pushed tag run: | echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - echo ${{ env.RELEASE_VERSION }} + echo "$RELEASE_VERSION" #---------------------------------------------- # override version tag diff --git a/.github/workflows/scan_repo.yml b/.github/workflows/scan_repo.yml index 582c68f45..434b4b522 100644 --- a/.github/workflows/scan_repo.yml +++ b/.github/workflows/scan_repo.yml @@ -12,6 +12,9 @@ jobs: trivy: name: Trivy runs-on: ubuntu-latest + env: + TRIVY_DB_REPOSITORY: public.ecr.aws/aquasecurity/trivy-db:2 + TRIVY_JAVA_DB_REPOSITORY: public.ecr.aws/aquasecurity/trivy-java-db:1 steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 535d7804f..0090d1558 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ concurrency: cancel-in-progress: true jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: POETRY_VERSION: 1.3.0 strategy: @@ -127,11 +127,31 @@ jobs: #---------------------------------------------- # run integration test suite #---------------------------------------------- + + - name: Retrieve telemetry access token from IDP + if: ${{ contains(fromJSON('["3.10"]'), matrix.python-version) }} + id: retrieve-telemetry-access-token + run: | + response=$(curl --request POST \ + --url ${{ vars.TELEMETRY_AUTH_CLIENT_URL }} \ + --header 'content-type: application/json' \ + --data '{"client_id":"${{ vars.TELEMETRY_AUTH_CLIENT_ID }}","client_secret":"${{ secrets.TELEMETRY_AUTH_CLIENT_SECRET }}","audience":"${{ vars.TELEMETRY_AUTH_AUDIENCE }}","grant_type":"client_credentials"}') + access_token=$(echo $response | jq -r .access_token) + echo "::add-mask::$access_token" + echo "TELEMETRY_ACCESS_TOKEN=$access_token" >> "$GITHUB_OUTPUT" - name: Run integration tests if: ${{ contains(fromJSON('["3.10"]'), matrix.python-version) }} env: SYNAPSE_ACCESS_TOKEN: ${{ secrets.SYNAPSE_ACCESS_TOKEN }} SERVICE_ACCOUNT_CREDS: ${{ secrets.SERVICE_ACCOUNT_CREDS }} + 
OTEL_EXPORTER_OTLP_HEADERS: "Authorization=Bearer ${{ steps.retrieve-telemetry-access-token.outputs.TELEMETRY_ACCESS_TOKEN }}" + DEPLOYMENT_ENVIRONMENT: ${{ vars.DEPLOYMENT_ENVIRONMENT }} + OTEL_EXPORTER_OTLP_ENDPOINT: ${{ vars.OTEL_EXPORTER_OTLP_ENDPOINT }} + TRACING_EXPORT_FORMAT: ${{ vars.TRACING_EXPORT_FORMAT }} + LOGGING_EXPORT_FORMAT: ${{ vars.LOGGING_EXPORT_FORMAT }} + TRACING_SERVICE_NAME: ${{ vars.TRACING_SERVICE_NAME }} + LOGGING_SERVICE_NAME: ${{ vars.LOGGING_SERVICE_NAME }} + SERVICE_INSTANCE_ID: ${{ github.head_ref || github.ref_name }} run: > poetry run pytest --durations=0 --cov-append --cov-report=term --cov-report=html:htmlcov --cov-report=xml:coverage.xml --cov=schematic/ -m "not (rule_benchmark or single_process_execution)" --reruns 4 -n 8 --ignore=tests/unit @@ -141,6 +161,13 @@ jobs: env: SYNAPSE_ACCESS_TOKEN: ${{ secrets.SYNAPSE_ACCESS_TOKEN }} SERVICE_ACCOUNT_CREDS: ${{ secrets.SERVICE_ACCOUNT_CREDS }} + OTEL_EXPORTER_OTLP_HEADERS: "Authorization=Bearer ${{ steps.retrieve-telemetry-access-token.outputs.TELEMETRY_ACCESS_TOKEN }}" + DEPLOYMENT_ENVIRONMENT: ${{ vars.DEPLOYMENT_ENVIRONMENT }} + OTEL_EXPORTER_OTLP_ENDPOINT: ${{ vars.OTEL_EXPORTER_OTLP_ENDPOINT }} + TRACING_EXPORT_FORMAT: ${{ vars.TRACING_EXPORT_FORMAT }} + LOGGING_EXPORT_FORMAT: ${{ vars.LOGGING_EXPORT_FORMAT }} + TRACING_SERVICE_NAME: ${{ vars.TRACING_SERVICE_NAME }} + LOGGING_SERVICE_NAME: ${{ vars.LOGGING_SERVICE_NAME }} run: > poetry run pytest --durations=0 --cov-append --cov-report=term --cov-report=html:htmlcov --cov-report=xml:coverage.xml --cov=schematic/ -m "single_process_execution" --reruns 4 --ignore=tests/unit diff --git a/CONTRIBUTION.md b/CONTRIBUTION.md index 4d8646e6c..974898f03 100644 --- a/CONTRIBUTION.md +++ b/CONTRIBUTION.md @@ -6,7 +6,7 @@ Please note we have a [code of conduct](CODE_OF_CONDUCT.md), please follow it in ## How to report bugs or feature requests -You can **create bug and feature requests** through [Sage Bionetwork's FAIR Data service desk](https://sagebionetworks.jira.com/servicedesk/customer/portal/5/group/8). Providing enough details to the developers to verify and troubleshoot your issue is paramount: +You can **create bug and feature requests** through [Sage Bionetwork's DPE schematic support](https://sagebionetworks.jira.com/servicedesk/customer/portal/5/group/7/create/225). Providing enough details to the developers to verify and troubleshoot your issue is paramount: - **Provide a clear and descriptive title as well as a concise summary** of the issue to identify the problem. - **Describe the exact steps which reproduce the problem** in as many details as possible. - **Describe the behavior you observed after following the steps** and point out what exactly is the problem with that behavior. @@ -25,7 +25,7 @@ For new features, bugs, enhancements: #### 1. Branch Setup * Pull the latest code from the develop branch in the upstream repository. -* Checkout a new branch formatted like so: `develop-` from the develop branch +* Checkout a new branch formatted like so: `-` from the develop branch #### 2. Development Workflow * Develop on your new branch. @@ -35,11 +35,11 @@ For new features, bugs, enhancements: * You can choose to create a draft PR if you prefer to develop this way #### 3. 
Branch Management -* Push code to `develop-` in upstream repo: +* Push code to `-` in upstream repo: ``` - git push develop- + git push - ``` -* Branch off `develop-` if you need to work on multiple features associated with the same code base +* Branch off `-` if you need to work on multiple features associated with the same code base * After feature work is complete and before creating a PR to the develop branch in upstream a. ensure that code runs locally b. test for logical correctness locally @@ -47,10 +47,10 @@ For new features, bugs, enhancements: c. wait for git workflow to complete (e.g. tests are run) on github #### 4. Pull Request and Review -* Create a PR from `develop-` into the develop branch of the upstream repo +* Create a PR from `-` into the develop branch of the upstream repo * Request a code review on the PR * Once code is approved merge in the develop branch. The **"Squash and merge"** strategy should be used for a cleaner commit history on the `develop` branch. The description of the squash commit should include enough information to understand the context of the changes that were made. -* Once the actions pass on the main branch, delete the `develop-` branch +* Once the actions pass on the main branch, delete the `-` branch ### Updating readthedocs documentation 1. Navigate to the docs directory. diff --git a/Dockerfile b/Dockerfile index 4a5ea43f9..7653ee335 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,4 +29,4 @@ RUN poetry install --no-interaction --no-ansi --no-root COPY . ./ -RUN poetry install --only-root \ No newline at end of file +RUN poetry install --only-root diff --git a/env.example b/env.example index f1f56a8a4..bf999908b 100644 --- a/env.example +++ b/env.example @@ -11,6 +11,8 @@ SERVICE_ACCOUNT_CREDS='Provide service account creds' # LOGGING_EXPORT_FORMAT=otlp # TRACING_SERVICE_NAME=schematic-api # LOGGING_SERVICE_NAME=schematic-api +## Instance ID is used during integration tests export to identify the git branch +# SERVICE_INSTANCE_ID=schematic-1234 ## Other examples: dev, staging, prod # DEPLOYMENT_ENVIRONMENT=local # OTEL_EXPORTER_OTLP_ENDPOINT=https://..../telemetry diff --git a/pyproject.toml b/pyproject.toml index 39315d360..4a75a7ad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "schematicpy" -version = "24.11.2" +version = "24.12.1" description = "Package for biomedical data model and metadata ingress management" authors = [ "Milen Nikolov ", diff --git a/schematic/__init__.py b/schematic/__init__.py index 718feb6a3..a43f53553 100644 --- a/schematic/__init__.py +++ b/schematic/__init__.py @@ -10,7 +10,13 @@ from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor -from opentelemetry.sdk.resources import DEPLOYMENT_ENVIRONMENT, SERVICE_NAME, Resource +from opentelemetry.sdk.resources import ( + DEPLOYMENT_ENVIRONMENT, + SERVICE_INSTANCE_ID, + SERVICE_NAME, + SERVICE_VERSION, + Resource, +) from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor, Span from opentelemetry.sdk.trace.sampling import ALWAYS_OFF @@ -20,6 +26,7 @@ from schematic.configuration.configuration import CONFIG from schematic.loader import LOADER +from schematic.version import __version__ from schematic_api.api.security_controller import info_from_bearer_auth Synapse.allow_client_caching(False) @@ -91,16 +98,14 @@ def 
set_up_tracing(session: requests.Session) -> None: Synapse.enable_open_telemetry(True) tracing_service_name = os.environ.get("TRACING_SERVICE_NAME", "schematic-api") deployment_environment = os.environ.get("DEPLOYMENT_ENVIRONMENT", "") + service_instance_id = os.environ.get("SERVICE_INSTANCE_ID", "") trace.set_tracer_provider( TracerProvider( resource=Resource( attributes={ + SERVICE_INSTANCE_ID: service_instance_id, SERVICE_NAME: tracing_service_name, - # TODO: Revisit this portion later on. As of 11/12/2024 when - # deploying this to ECS or running within a docker container, - # the package version errors out with the following error: - # importlib.metadata.PackageNotFoundError: No package metadata was found for schematicpy - # SERVICE_VERSION: package_version, + SERVICE_VERSION: __version__, DEPLOYMENT_ENVIRONMENT: deployment_environment, } ) @@ -122,11 +127,14 @@ def set_up_logging(session: requests.Session) -> None: logging_export = os.environ.get("LOGGING_EXPORT_FORMAT", None) logging_service_name = os.environ.get("LOGGING_SERVICE_NAME", "schematic-api") deployment_environment = os.environ.get("DEPLOYMENT_ENVIRONMENT", "") + service_instance_id = os.environ.get("SERVICE_INSTANCE_ID", "") if logging_export == "otlp": resource = Resource.create( { + SERVICE_INSTANCE_ID: service_instance_id, SERVICE_NAME: logging_service_name, DEPLOYMENT_ENVIRONMENT: deployment_environment, + SERVICE_VERSION: __version__, } ) diff --git a/schematic/__main__.py b/schematic/__main__.py index 22216f273..094624d7e 100644 --- a/schematic/__main__.py +++ b/schematic/__main__.py @@ -13,6 +13,7 @@ from schematic.visualization.commands import ( viz as viz_cli, ) # viz generation commands +from schematic import __version__ logger = logging.getLogger() click_log.basic_config(logger) @@ -24,6 +25,7 @@ # invoke_without_command=True -> forces the application not to show aids before losing them with a --h @click.group(context_settings=CONTEXT_SETTINGS, invoke_without_command=True) @click_log.simple_verbosity_option(logger) +@click.version_option(version=__version__, prog_name="schematic") def main(): """ Command line interface to the `schematic` backend services. 
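
For reference, a minimal sketch (not part of the patch) of how the `click.version_option` decorator added to `schematic/__main__.py` is expected to surface the now-hardcoded package version on the CLI; the standalone group and the `CliRunner` invocation below are illustrative only:

```python
# Illustrative sketch only: mirrors the decorator added to schematic/__main__.py above.
import click
from click.testing import CliRunner

__version__ = "24.12.1"  # hardcoded in schematic/version.py by this changeset


@click.group(invoke_without_command=True)
@click.version_option(version=__version__, prog_name="schematic")
def main():
    """Command line interface to the `schematic` backend services."""


if __name__ == "__main__":
    # Expected output: "schematic, version 24.12.1"
    result = CliRunner().invoke(main, ["--version"])
    print(result.output)
```
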
diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py index 69b86e136..981e6d8e3 100644 --- a/schematic/manifest/generator.py +++ b/schematic/manifest/generator.py @@ -1904,6 +1904,8 @@ def get_manifest( # TODO: avoid explicitly exposing Synapse store functionality # just instantiate a Store class and let it decide at runtime/config # the store type + # TODO: determine which parts of fileview are necessary for `get` operations + # and pass query parameters at object instantiation to avoid having to re-query if access_token: # for getting an existing manifest on AWS store = SynapseStorage(access_token=access_token) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 9b13bebaf..69eea3fab 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -17,6 +17,7 @@ from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import read_csv from schematic.utils.validate_rules_utils import validation_rule_info from schematic.utils.validate_utils import ( comma_separated_list_regex, @@ -868,7 +869,7 @@ def _get_target_manifest_dataframes( entity: File = self.synStore.getDatasetManifest( datasetId=dataset_id, downloadFile=True ) - manifests.append(pd.read_csv(entity.path)) + manifests.append(read_csv(entity.path)) return dict(zip(manifest_ids, manifests)) def get_target_manifests( @@ -2119,7 +2120,9 @@ def filename_validation( where_clauses = [] - dataset_clause = f"parentId='{dataset_scope}'" + dataset_clause = SynapseStorage.build_clause_from_dataset_id( + dataset_id=dataset_scope + ) where_clauses.append(dataset_clause) self._login( diff --git a/schematic/store/database/synapse_database_wrapper.py b/schematic/store/database/synapse_database_wrapper.py index b827b140f..ba0ed2dc9 100644 --- a/schematic/store/database/synapse_database_wrapper.py +++ b/schematic/store/database/synapse_database_wrapper.py @@ -8,6 +8,7 @@ from opentelemetry import trace from schematic.store.synapse_tracker import SynapseEntityTracker +from schematic.utils.df_utils import read_csv class SynapseTableNameError(Exception): @@ -108,7 +109,7 @@ def execute_sql_query( pandas.DataFrame: The queried table """ result = self.execute_sql_statement(query, include_row_data) - table = pandas.read_csv(result.filepath) + table = read_csv(result.filepath) return table def execute_sql_statement( diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 7ccd98362..93a7140f2 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -19,12 +19,10 @@ import numpy as np import pandas as pd import synapseclient -import synapseutils from opentelemetry import trace from synapseclient import Annotations as OldAnnotations from synapseclient import ( Column, - Entity, EntityViewSchema, EntityViewType, File, @@ -58,7 +56,12 @@ from schematic.store.base import BaseStorage from schematic.store.database.synapse_database import SynapseDatabase from schematic.store.synapse_tracker import SynapseEntityTracker -from schematic.utils.df_utils import col_in_dataframe, load_df, update_df +from schematic.utils.df_utils import ( + STR_NA_VALUES_FILTERED, + col_in_dataframe, + load_df, + update_df, +) # entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment # Please do not remove these import statements @@ -401,7 +404,7 @@ def query_fileview( 
try: self.storageFileviewTable = self.syn.tableQuery( query=self.fileview_query, - ).asDataFrame() + ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) except SynapseHTTPError as exc: exception_text = str(exc) if "Unknown column path" in exception_text: @@ -416,6 +419,30 @@ def query_fileview( else: raise AccessCredentialsError(self.storageFileview) + @staticmethod + def build_clause_from_dataset_id( + dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None + ) -> str: + """ + Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized. + Args: + dataset_id: Synapse ID of a dataset that should be used to limit the query + dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query + Returns: + clause for the query or an empty string if no dataset ID is provided + """ + # Calling this method without specifying synIDs will complete but will not scope the view + if (not dataset_id) and (not dataset_folder_list): + return "" + + # This will be used to gather files under a dataset recursively with a fileview query instead of walking + if dataset_folder_list: + search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list) + return f"parentId IN ({search_folders})" + + # `dataset_id` should be provided when all files are stored directly under the dataset folder + return f"parentId='{dataset_id}'" + def _build_query( self, columns: Optional[list] = None, where_clauses: Optional[list] = None ): @@ -666,7 +693,7 @@ def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: def getFilesInStorageDataset( self, datasetId: str, fileNames: List = None, fullpath: bool = True ) -> List[Tuple[str, str]]: - """Gets all files in a given dataset folder. + """Gets all files (excluding manifest files) in a given dataset folder. Args: datasetId: synapse ID of a storage dataset. @@ -680,105 +707,58 @@ def getFilesInStorageDataset( Raises: ValueError: Dataset ID not found. """ - # select all files within a given storage dataset folder (top level folder in - # a Synapse storage project or folder marked with contentType = 'dataset') - walked_path = synapseutils.walk( - self.syn, datasetId, includeTypes=["folder", "file"] - ) - - current_entity_location = self.synapse_entity_tracker.get( - synapse_id=datasetId, syn=self.syn, download_file=False - ) - - def walk_back_to_project( - current_location: Entity, location_prefix: str, skip_entry: bool - ) -> str: - """ - Recursively walk back up the project structure to get the paths of the - names of each of the directories where we started the walk function. - - Args: - current_location (Entity): The current entity location in the project structure. - location_prefix (str): The prefix to prepend to the path. - skip_entry (bool): Whether to skip the current entry in the path. When - this is True it means we are looking at our starting point. If our - starting point is the project itself we can go ahead and return - back the project as the prefix. - - Returns: - str: The path of the names of each of the directories up to the project root. 
- """ - if ( - skip_entry - and "concreteType" in current_location - and current_location["concreteType"] == PROJECT_ENTITY - ): - return f"{current_location.name}/{location_prefix}" + file_list = [] - updated_prefix = ( - location_prefix - if skip_entry - else f"{current_location.name}/{location_prefix}" - ) - if ( - "concreteType" in current_location - and current_location["concreteType"] == PROJECT_ENTITY - ): - return updated_prefix - current_location = self.synapse_entity_tracker.get( - synapse_id=current_location["parentId"], - syn=self.syn, - download_file=False, + # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view + if self.storageFileviewTable.empty: + raise ValueError( + f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." ) - return walk_back_to_project( - current_location=current_location, - location_prefix=updated_prefix, - skip_entry=False, + + child_path = self.storageFileviewTable.loc[ + self.storageFileviewTable["parentId"] == datasetId, "path" + ] + if child_path.empty: + raise LookupError( + f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." ) + child_path = child_path.iloc[0] - prefix = walk_back_to_project( - current_location=current_entity_location, - location_prefix="", - skip_entry=True, - ) + # Get the dataset path by eliminating the child's portion of the path to account for nested datasets + parent = child_path.split("/")[:-1] + parent = "/".join(parent) - project_id = self.getDatasetProject(datasetId) - project = self.synapse_entity_tracker.get( - synapse_id=project_id, syn=self.syn, download_file=False - ) - project_name = project.name - file_list = [] + # Format dataset path to be used in table query + dataset_path = f"'{parent}/%'" - # iterate over all results - for dirpath, _, path_filenames in walked_path: - # iterate over all files in a folder - for path_filename in path_filenames: - if ("manifest" not in path_filename[0] and not fileNames) or ( - fileNames and path_filename[0] in fileNames - ): - # don't add manifest to list of files unless it is specified in the - # list of specified fileNames; return all found files - # except the manifest if no fileNames have been specified - # TODO: refactor for clarity/maintainability - - if fullpath: - # append directory path to filename - if dirpath[0].startswith(f"{project_name}/"): - path_without_project_prefix = ( - dirpath[0] + "/" - ).removeprefix(f"{project_name}/") - path_filename = ( - prefix + path_without_project_prefix + path_filename[0], - path_filename[1], - ) - else: - path_filename = ( - prefix + dirpath[0] + "/" + path_filename[0], - path_filename[1], - ) + # When querying, only include files to exclude entity files and subdirectories + where_clauses = [f"path like {dataset_path}", "type='file'"] + + # Requery the fileview to specifically get the files in the given dataset + self.query_fileview(columns=["id", "path"], where_clauses=where_clauses) + + # Exclude manifest files + non_manifest_files = self.storageFileviewTable.loc[ + ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"), + :, + ] + + # Remove all files that are not in the list of fileNames + if fileNames: + filename_regex = "|".join(fileNames) + + matching_files = non_manifest_files["path"].str.contains( + filename_regex, case=False, regex=True + ) + + non_manifest_files = non_manifest_files.loc[matching_files, :] - # add file name file id tuple, rearranged so that id is first and 
name follows - file_list.append(path_filename[::-1]) + # Truncate path if necessary + if not fullpath: + non_manifest_files.path = non_manifest_files.path.apply(os.path.basename) + + # Return list of files as expected by other methods + file_list = list(non_manifest_files.itertuples(index=False, name=None)) return file_list @@ -1056,7 +1036,7 @@ def updateDatasetManifestFiles( Returns: Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. - If there is no existing manifest return None + If there is no existing manifest or if the manifest does not have an entityId column, return None """ # get existing manifest Synapse ID @@ -1070,8 +1050,12 @@ def updateDatasetManifestFiles( synapse_id=manifest_id, syn=self.syn, download_file=True ) manifest_filepath = manifest_entity.path - manifest = load_df(manifest_filepath) + + # If the manifest does not have an entityId column, trigger a new manifest to be generated + if "entityId" not in manifest.columns: + return None + manifest_is_file_based = "Filename" in manifest.columns if manifest_is_file_based: @@ -1079,7 +1063,6 @@ def updateDatasetManifestFiles( # note that if there is an existing manifest and there are files in the dataset # the columns Filename and entityId are assumed to be present in manifest schema # TODO: use idiomatic panda syntax - dataset_files, manifest = self.fill_in_entity_id_filename( datasetId, manifest ) @@ -1119,6 +1102,13 @@ def _get_file_entityIds( "No manifest was passed in, a manifest is required when `only_new_files` is True." ) + if "entityId" not in manifest.columns: + raise ValueError( + "The manifest in your dataset and/or top level folder must contain the 'entityId' column. " + "Please generate an empty manifest without annotations, manually add annotations to the " + "appropriate files in the manifest, and then try again." 
+ ) + # find new files (that are not in the current manifest) if any for file_id, file_name in dataset_files: if not file_id in manifest["entityId"].values: @@ -1433,7 +1423,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable """ results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) - df = results.asDataFrame(rowIdAndVersionInIndex=False) + df = results.asDataFrame( + rowIdAndVersionInIndex=False, + na_values=STR_NA_VALUES_FILTERED, + keep_default_na=False, + ) return df, results @@ -3485,7 +3479,11 @@ def query(self, tidy=True, force=False): if self.table is None or force: fileview_id = self.view_schema["id"] self.results = self.synapse.tableQuery(f"select * from {fileview_id}") - self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False) + self.table = self.results.asDataFrame( + rowIdAndVersionInIndex=False, + na_values=STR_NA_VALUES_FILTERED, + keep_default_na=False, + ) if tidy: self.tidy_table() return self.table diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py index 07c97af90..c6412d349 100644 --- a/schematic/utils/cli_utils.py +++ b/schematic/utils/cli_utils.py @@ -4,10 +4,11 @@ # pylint: disable=anomalous-backslash-in-string import logging - -from typing import Any, Mapping, Sequence, Union, Optional -from functools import reduce import re +from functools import reduce +from typing import Any, Mapping, Optional, Sequence, Union + +from schematic.utils.general import SYN_ID_REGEX logger = logging.getLogger(__name__) @@ -69,7 +70,7 @@ def parse_syn_ids( if not syn_ids: return None - project_regex = re.compile("(syn\d+\,?)+") + project_regex = re.compile(SYN_ID_REGEX) valid = project_regex.fullmatch(syn_ids) if not valid: diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index b25e7db82..ee458ba17 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -4,17 +4,58 @@ import logging from copy import deepcopy -from time import perf_counter -from typing import Union, Any, Optional from datetime import datetime +from time import perf_counter +from typing import Any, Optional, Union + import dateparser as dp -import pandas as pd import numpy as np +import pandas as pd from pandarallel import pandarallel # type: ignore +# pylint:disable=no-name-in-module +from pandas._libs.parsers import STR_NA_VALUES # type: ignore + +STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES) + +try: + STR_NA_VALUES_FILTERED.remove("None") +except KeyError: + pass + logger = logging.getLogger(__name__) +def read_csv( + path_or_buffer: str, + keep_default_na: bool = False, + encoding: str = "utf8", + **load_args: Any, +) -> pd.DataFrame: + """ + A wrapper around pd.read_csv that filters out "None" from the na_values list. + + Args: + path_or_buffer: The path to the file or a buffer containing the file. + keep_default_na: Whether to keep the default na_values list. + encoding: The encoding of the file. + **load_args: Additional arguments to pass to pd.read_csv. + + Returns: + pd.DataFrame: The dataframe created from the CSV file or buffer. 
+ """ + na_values = load_args.pop( + "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None + ) + return pd.read_csv( # type: ignore + path_or_buffer, + na_values=na_values, + keep_default_na=keep_default_na, + encoding=encoding, + **load_args, + ) + + def load_df( file_path: str, preserve_raw_input: bool = True, @@ -45,9 +86,7 @@ def load_df( t_load_df = perf_counter() # Read CSV to df as type specified in kwargs - org_df = pd.read_csv( # type: ignore - file_path, keep_default_na=True, encoding="utf8", **load_args - ) + org_df = read_csv(file_path, encoding="utf8", **load_args) # type: ignore if not isinstance(org_df, pd.DataFrame): raise ValueError( ( diff --git a/schematic/utils/general.py b/schematic/utils/general.py index 0bb932aa3..cdc1be2f2 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -5,7 +5,7 @@ import logging import os import pstats -import subprocess +from pathlib import Path import tempfile from cProfile import Profile from datetime import datetime, timedelta @@ -24,6 +24,8 @@ T = TypeVar("T") +SYN_ID_REGEX = r"(syn\d+\,?)+" + def find_duplicates(_list: list[T]) -> set[T]: """Find duplicate items in a list""" @@ -129,40 +131,19 @@ def calculate_datetime( return date_time_result -def check_synapse_cache_size( - directory: str = "/root/.synapseCache", -) -> float: - """use du --sh command to calculate size of .synapseCache. +def check_synapse_cache_size(directory: str = "/root/.synapseCache") -> float: + """Calculate size of .synapseCache directory in bytes using pathlib. Args: directory (str, optional): .synapseCache directory. Defaults to '/root/.synapseCache' Returns: - float: returns size of .synapsecache directory in bytes + float: size of .synapsecache directory in bytes """ - # Note: this command might fail on windows user. - # But since this command is primarily for running on AWS, it is fine. 
- command = ["du", "-sh", directory] - output = subprocess.run(command, capture_output=True, check=False).stdout.decode( - "utf-8" + total_size = sum( + f.stat().st_size for f in Path(directory).rglob("*") if f.is_file() ) - - # Parsing the output to extract the directory size - size = output.split("\t")[0] - if "K" in size: - size_in_kb = float(size.rstrip("K")) - byte_size = size_in_kb * 1000 - elif "M" in size: - size_in_mb = float(size.rstrip("M")) - byte_size = size_in_mb * 1000000 - elif "G" in size: - size_in_gb = float(size.rstrip("G")) - byte_size = size_in_gb * (1024**3) - elif "B" in size: - byte_size = float(size.rstrip("B")) - else: - logger.error("Cannot recognize the file size unit") - return byte_size + return total_size def clear_synapse_cache(synapse_cache: cache.Cache, minutes: int) -> int: diff --git a/schematic/version.py b/schematic/version.py index a9bdb1578..cec250131 100644 --- a/schematic/version.py +++ b/schematic/version.py @@ -1,4 +1,3 @@ """Sets the version of the package""" -import importlib.metadata - -__version__ = importlib.metadata.version("schematic") +# Version hardcoded see https://sagebionetworks.jira.com/browse/SCHEMATIC-229 +__version__ = "24.12.1" diff --git a/schematic_api/api/openapi/api.yaml b/schematic_api/api/openapi/api.yaml index 8689a8ae2..7c32e8d90 100644 --- a/schematic_api/api/openapi/api.yaml +++ b/schematic_api/api/openapi/api.yaml @@ -692,8 +692,8 @@ paths: - Synapse Storage /storage/dataset/files: get: - summary: Get all files in a given dataset folder - description: Get all files in a given dataset folder + summary: Get all files (excluding manifest files) in a given dataset folder + description: Get all files (excluding manifest files) in a given dataset folder operationId: schematic_api.api.routes.get_files_storage_dataset security: - access_token: [] diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 97484c15c..ad1e25279 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -20,6 +20,7 @@ from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser from schematic.store.synapse import ManifestDownload, SynapseStorage +from schematic.utils.df_utils import read_csv from schematic.utils.general import create_temp_folder, entity_type_mapping from schematic.utils.schema_utils import ( DisplayLabelType, @@ -178,7 +179,7 @@ def parse_bool(str_bool): def return_as_json(manifest_local_file_path): - manifest_csv = pd.read_csv(manifest_local_file_path) + manifest_csv = read_csv(manifest_local_file_path) manifest_json = manifest_csv.to_dict(orient="records") return manifest_json diff --git a/tests/conftest.py b/tests/conftest.py index 2f9dd3047..d373e6ae6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from flask.testing import FlaskClient from opentelemetry import trace +from opentelemetry.instrumentation.flask import FlaskInstrumentor from synapseclient.client import Synapse from schematic.configuration.configuration import CONFIG, Configuration @@ -42,6 +43,8 @@ TESTS_DIR = os.path.dirname(os.path.abspath(__file__)) DATA_DIR = os.path.join(TESTS_DIR, "data") +FlaskInstrumentor().uninstrument() + @pytest.fixture(scope="session") def dataset_id(): diff --git a/tests/data/example.model.csv b/tests/data/example.model.csv index 7438c7145..1c2923535 100644 --- a/tests/data/example.model.csv +++ b/tests/data/example.model.csv @@ -10,7 +10,7 @@ Cancer 
Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,, Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,, Sample ID,,,,,TRUE,DataProperty,,, -Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,, +Tissue Status,,"Healthy, Malignant, None",,,TRUE,DataProperty,,, Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,, Filename,,,,,TRUE,DataProperty,,,#MockFilename filenameExists^^ File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,, @@ -24,8 +24,8 @@ Check List,,,,,TRUE,DataProperty,,,list Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list Check List Like,,,,,TRUE,DataProperty,,,list like Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like -Check List Strict,,,,,TRUE,DataProperty,,,list strict -Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict +Check List Strict,,,,,TRUE,DataProperty,,,list strict +Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f] Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f] Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f] diff --git a/tests/data/example.model.jsonld b/tests/data/example.model.jsonld index c4279a605..7e4560d50 100644 --- a/tests/data/example.model.jsonld +++ b/tests/data/example.model.jsonld @@ -540,6 +540,9 @@ }, { "@id": "bts:Malignant" + }, + { + "@id": "bts:None" } ], "sms:displayName": "Tissue Status", @@ -563,6 +566,23 @@ "sms:required": "sms:false", "sms:validationRules": [] }, + { + "@id": "bts:None", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "None", + "rdfs:subClassOf": [ + { + "@id": "bts:TissueStatus" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "None", + "sms:required": "sms:false", + "sms:validationRules": [] + }, { "@id": "bts:BulkRNA-seqAssay", "@type": "rdfs:Class", diff --git a/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv b/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv new file mode 100644 index 000000000..8230442f3 --- /dev/null +++ b/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv @@ -0,0 +1,6 @@ +Sample ID,Patient ID,Tissue Status,Component +1,1,Healthy,Biospecimen +2,2,Malignant,Biospecimen +3,3,None,Biospecimen +4,4,None,Biospecimen +5,5,InvalidValue,Biospecimen diff --git a/tests/data/mock_manifests/Valid_none_value_test_manifest.csv b/tests/data/mock_manifests/Valid_none_value_test_manifest.csv new file mode 100644 index 000000000..b7a0666d6 --- /dev/null +++ b/tests/data/mock_manifests/Valid_none_value_test_manifest.csv @@ -0,0 +1,6 @@ +Sample ID,Patient ID,Tissue Status,Component +1,1,Healthy,Biospecimen +2,2,Malignant,Biospecimen +3,3,None,Biospecimen +4,4,None,Biospecimen +5,5,None,Biospecimen \ No newline at end of file diff --git a/tests/integration/test_commands.py b/tests/integration/test_commands.py index 8658f6e6b..342736ab1 100644 --- a/tests/integration/test_commands.py +++ b/tests/integration/test_commands.py @@ -4,16 +4,16 @@ import uuid from io import BytesIO +import numpy as np import pytest import requests -from openpyxl import load_workbook from click.testing import CliRunner -import pandas as pd -import numpy as np +from openpyxl import load_workbook -from 
schematic.configuration.configuration import Configuration, CONFIG +from schematic.configuration.configuration import CONFIG, Configuration from schematic.manifest.commands import manifest from schematic.models.commands import model +from schematic.utils.df_utils import read_csv from tests.conftest import ConfigurationForTesting LIGHT_BLUE = "FFEAF7F9" # Required cell @@ -95,14 +95,14 @@ def test_validate_valid_manifest(self, runner: CliRunner) -> None: # command has no (python) errors, has exit code 0 assert result.exit_code == 0 # command output has success message - assert result.output.split("\n")[4] == ( + result_list = result.output.split("\n") + assert ( "Your manifest has been validated successfully. " "There are no errors in your manifest, " "and it can be submitted without any modifications." - ) + ) in result_list # command output has no validation errors - for line in result.output.split("\n")[4]: - assert not line.startswith("error") + errors = [errors for result in result_list if result.startswith("error")] def test_validate_invalid_manifest(self, runner: CliRunner) -> None: """ @@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None: # command has no (python) errors, has exit code 0 assert result.exit_code == 0 - biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv") - patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv") + biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv") + patient_df = read_csv("tests/data/example.Patient.manifest.csv") # Remove created files: finally: @@ -339,7 +339,7 @@ def test_generate_empty_google_sheet_manifests( assert False, f"Unexpected data validation found: {dv}" assert tissue_status_validation is not None assert tissue_status_validation.type == "list" - assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$3" + assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$4" # required fields are marked as “light blue”, while other non-required fields are marked as white. 
for col in ["Sample ID", "Patient ID", "Tissue Status", "Component"]: @@ -504,9 +504,10 @@ def test_generate_empty_excel_manifest( os.remove("tests/data/example.Biospecimen.schema.json") # command output has excel file creation message + result_list = result.output.split("\n") assert ( - result.output.split("\n")[7] - == "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx" + "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx" + in result_list ) sheet1 = workbook["Sheet1"] @@ -665,18 +666,19 @@ def test_generate_bulk_rna_google_sheet_manifest( # Reset config to it's default values CONFIG.load_config("config_example.yml") - assert result.output.split("\n")[7] == ( - "Find the manifest template using this Google Sheet URL:" - ) - assert result.output.split("\n")[8].startswith( - "https://docs.google.com/spreadsheets/d/" - ) - assert result.output.split("\n")[9] == ( + result_list = result.output.split("\n") + assert "Find the manifest template using this Google Sheet URL:" in result_list + assert ( "Find the manifest template using this CSV file path: " "./CLI_gs_bulk_rna.csv" - ) - - google_sheet_url = result.output.split("\n")[8] + ) in result_list + google_sheet_result = [ + result + for result in result_list + if result.startswith("https://docs.google.com/spreadsheets/d/") + ] + assert len(google_sheet_result) == 1 + google_sheet_url = google_sheet_result[0] # Download the Google Sheets content as an Excel file and load into openpyxl export_url = f"{google_sheet_url}/export?format=xlsx" @@ -908,18 +910,19 @@ def test_generate_bulk_rna_google_sheet_manifest_with_annotations( os.remove("tests/data/example.BulkRNA-seqAssay.schema.json") os.remove("./CLI_gs_bulk_rna_annos.csv") - assert result.output.split("\n")[10] == ( - "Find the manifest template using this Google Sheet URL:" - ) - assert result.output.split("\n")[11].startswith( - "https://docs.google.com/spreadsheets/d/" - ) - assert result.output.split("\n")[12] == ( + result_list = result.output.split("\n") + assert "Find the manifest template using this Google Sheet URL:" in result_list + assert ( "Find the manifest template using this CSV file path: " "./CLI_gs_bulk_rna_annos.csv" - ) - - google_sheet_url = result.output.split("\n")[11] + ) in result_list + google_sheet_result = [ + result + for result in result_list + if result.startswith("https://docs.google.com/spreadsheets/d/") + ] + assert len(google_sheet_result) == 1 + google_sheet_url = google_sheet_result[0] # Download the Google Sheets content as an Excel file and load into openpyxl export_url = f"{google_sheet_url}/export?format=xlsx" @@ -1177,10 +1180,11 @@ def test_generate_mock_component_excel_manifest(self, runner: CliRunner) -> None # TODO: remove with https://sagebionetworks.jira.com/browse/SCHEMATIC-202 # Reset config to it's default values CONFIG.load_config("config_example.yml") - # Command output has excel file message - assert result.output.split("\n")[8] == ( + result_list = result.output.split("\n") + assert ( "Find the manifest template using this Excel file path: ./CLI_mock_comp.xlsx" + in result_list ) sheet1 = workbook["Sheet1"] diff --git a/tests/integration/test_manifest_submission.py b/tests/integration/test_manifest_submission.py index 92e6911c1..84581f7d8 100644 --- a/tests/integration/test_manifest_submission.py +++ b/tests/integration/test_manifest_submission.py @@ -4,7 +4,6 @@ import uuid from typing import Any, Callable, Dict -import pandas as pd import pytest import requests from 
flask.testing import FlaskClient @@ -12,6 +11,7 @@ from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import read_csv from tests.conftest import ConfigurationForTesting, Helpers from tests.utils import CleanupItem @@ -73,7 +73,7 @@ def validate_submitted_manifest_file( manifest_file_path = os.path.join( download_location, manifest_data["properties"]["name"] ) - manifest_submitted_df = pd.read_csv(manifest_file_path) + manifest_submitted_df = read_csv(manifest_file_path) assert "entityId" in manifest_submitted_df.columns assert "Id" in manifest_submitted_df.columns @@ -1035,6 +1035,11 @@ def test_submit_manifest_with_hide_blanks( randomized_annotation_content = str(uuid.uuid4()) df["RandomizedAnnotation"] = randomized_annotation_content + # AND a "None" string remains in the manifest + df["NoneString"] = "None" + df["NoneString1"] = "none" + df["NoneString2"] = "NoNe" + with tempfile.NamedTemporaryFile(delete=True, suffix=".csv") as tmp_file: # Write the DF to a temporary file df.to_csv(tmp_file.name, index=False) @@ -1070,6 +1075,9 @@ def test_submit_manifest_with_hide_blanks( modified_file = syn.get(df["entityId"][0], downloadFile=False) assert modified_file is not None assert modified_file["RandomizedAnnotation"][0] == randomized_annotation_content + assert modified_file["NoneString"][0] == "None" + assert modified_file["NoneString1"][0] == "none" + assert modified_file["NoneString2"][0] == "NoNe" # AND the blank annotations are not present assert "Genome Build" not in modified_file diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 2178a83b8..21da9cfd8 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -1,5 +1,5 @@ """ -This script contains a test suite for verifying the submission and annotation of +This script contains a test suite for verifying the validation, submission and annotation of file-based manifests using the `TestMetadataModel` class to communicate with Synapse and verify the expected behavior of uploading annotation manifest CSVs using the metadata model. 
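
To make the intent of the "None"-string assertions above concrete, here is a small sketch (not part of the patch) of the behaviour the new `read_csv` wrapper from `schematic/utils/df_utils.py` in this changeset is expected to provide, using an in-memory CSV rather than a real manifest:

```python
# Illustrative only: contrasts pandas defaults with the read_csv wrapper added in this PR.
import io

import pandas as pd

from schematic.utils.df_utils import read_csv  # wrapper defined earlier in this diff

csv_text = "Sample ID,Tissue Status\n1,Healthy\n2,None\n"

# Default pandas behaviour: the literal string "None" is coerced to NaN.
default_df = pd.read_csv(io.StringIO(csv_text))
assert pd.isna(default_df["Tissue Status"][1])

# Wrapper behaviour: "None" is dropped from the na_values set, so it survives as a string.
wrapped_df = read_csv(io.StringIO(csv_text))
assert wrapped_df["Tissue Status"][1] == "None"
```
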
@@ -13,7 +13,7 @@ import tempfile import uuid from contextlib import nullcontext as does_not_raise -from typing import Callable, Optional +from typing import Any, Callable, Optional import pandas as pd import pytest @@ -23,6 +23,7 @@ from synapseclient.models import File, Folder from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import create_temp_folder from tests.conftest import Helpers, metadata_model from tests.utils import CleanupItem @@ -139,7 +140,6 @@ async def test_submit_filebased_manifest_file_and_entities_valid_manifest_submit dir=create_temp_folder(path=tempfile.gettempdir()), ) as tmp_file: df.to_csv(tmp_file.name, index=False) - # WHEN the manifest is submitted (Assertions are handled in the helper method) self._submit_and_verify_manifest( helpers=helpers, @@ -229,7 +229,6 @@ async def test_submit_filebased_manifest_file_and_entities_mock_filename( dir=create_temp_folder(path=tempfile.gettempdir()), ) as tmp_file: df.to_csv(tmp_file.name, index=False) - # WHEN the manifest is submitted (Assertions are handled in the helper method) self._submit_and_verify_manifest( helpers=helpers, @@ -450,6 +449,11 @@ def _submit_and_verify_manifest( raise ValueError( "expected_manifest_id or expected_manifest_name must be provided" ) + # HACK: must requery the fileview to get new files, since SynapseStorage will query the last state + # of the fileview which may not contain any new folders in the fileview. + # This is a workaround to fileviews not always containing the latest information + # Since the tests don't always follow a similar process as testing resources are created and destroyed + synapse_store.query_fileview(force_requery=True) # Spies if already_spied: @@ -531,7 +535,7 @@ def _submit_and_verify_manifest( ) manifest_table = synapse_store.syn.tableQuery( f"select * from {expected_table_id}", downloadLocation=download_dir - ).asDataFrame() + ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) # AND the columns in the manifest table should reflect the ones in the file table_columns = manifest_table.columns @@ -566,3 +570,46 @@ def _submit_and_verify_manifest( spy_upload_file_as_table.call_count == 1 spy_upload_file_as_csv.assert_not_called() spy_upload_file_combo.assert_not_called() + + def test_validate_model_manifest_valid_with_none_string( + self, helpers: Helpers + ) -> None: + """ + Tests for validateModelManifest when the manifest is valid with 'None' values + + Args: + helpers: Test helper functions + """ + meta_data_model = metadata_model(helpers, "class_label") + + errors, warnings = meta_data_model.validateModelManifest( + manifestPath="tests/data/mock_manifests/Valid_none_value_test_manifest.csv", + rootNode="Biospecimen", + ) + assert not warnings + assert not errors + + def test_validate_model_manifest_invalid(self, helpers: Helpers) -> None: + """ + Tests for validateModelManifest when the manifest requires values to be from a set of values containing 'None' + + Args: + helpers: Test helper functions + """ + meta_data_model = metadata_model(helpers, "class_label") + + errors, warnings = meta_data_model.validateModelManifest( + manifestPath="tests/data/mock_manifests/Invalid_none_value_test_manifest.csv", + rootNode="Biospecimen", + ) + assert not warnings + assert errors[0][0] == "6" + assert errors[0][1] == "Tissue Status" + assert errors[0][3] == "InvalidValue" + # The order of the valid values in the error message are random, so the test must be + # 
slightly complicated: + # 'InvalidValue' is not one of ['Malignant', 'Healthy', 'None'] + # 'InvalidValue' is not one of ['Healthy', 'Malignant', 'None'] + error_message = errors[0][2] + assert isinstance(error_message, str) + assert error_message.startswith("'InvalidValue' is not one of") diff --git a/tests/integration/test_store_synapse.py b/tests/integration/test_store_synapse.py index 085a48fc3..5c39acc3c 100644 --- a/tests/integration/test_store_synapse.py +++ b/tests/integration/test_store_synapse.py @@ -1,10 +1,17 @@ -from unittest.mock import MagicMock +from re import compile as re_compile +from unittest.mock import MagicMock, create_autospec, patch import numpy as np +import pandas as pd import pytest +from synapseclient import Synapse +from synapseclient.core.cache import Cache +from synapseclient.table import build_table +from schematic.configuration.configuration import Configuration from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.store.synapse import SynapseStorage +from schematic.utils.general import SYN_ID_REGEX from schematic.utils.validate_utils import comma_separated_list_regex @@ -125,3 +132,327 @@ def test_process_row_annotations_get_validation( "value2", "value3", ] + + @pytest.mark.parametrize( + "asset_view, dataset_id, expected_files", + [ + ( + "syn23643253", + "syn61374924", + [ + ("syn61374926", "schematic - main/BulkRNASeq and files/txt1.txt"), + ("syn61374930", "schematic - main/BulkRNASeq and files/txt2.txt"), + ("syn62282794", "schematic - main/BulkRNASeq and files/txt3.txt"), + ("syn62282720", "schematic - main/BulkRNASeq and files/txt4.txt"), + ], + ), + ( + "syn23643253", + "syn25614635", + [ + ( + "syn25614636", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt", + ), + ( + "syn25614637", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt", + ), + ( + "syn25614638", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt", + ), + ], + ), + ( + "syn63917487", + "syn63917494", + [ + ( + "syn63917520", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt1.txt", + ), + ( + "syn63917521", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt2.txt", + ), + ( + "syn63917522", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt3.txt", + ), + ( + "syn63917518", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt4.txt", + ), + ], + ), + ( + "syn23643253", + "syn63927665", + [ + ( + "syn63927671", + "schematic - main/BulkRNAseq nested files/data/txt1.txt", + ), + ( + "syn63927672", + "schematic - main/BulkRNAseq nested files/data/txt2.txt", + ), + ( + "syn63927673", + "schematic - main/BulkRNAseq nested files/data/txt3.txt", + ), + ( + "syn63927670", + "schematic - main/BulkRNAseq nested files/data/txt4.txt", + ), + ], + ), + ( + "syn23643253", + "syn63987067", + [ + ( + "syn63987072", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt1.txt", + ), + ( + "syn63987073", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt2.txt", + ), + ( + "syn63987074", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt3.txt", + ), + ( + "syn63987071", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt4.txt", + ), + ], + ), + ], + ids=[ + "top level folder", + "nested dataset", + "contentType:dataset annotation", + "nested data 
files", + "doubly nested data files", + ], + ) + @pytest.mark.parametrize( + "filenames", + [None, ["txt1.txt", "txt2.txt"]], + ids=["no file filtering", "file filtering"], + ) + def test_get_files_in_storage_dataset( + self, filenames, asset_view, dataset_id, expected_files, synapse_store + ): + # GIVEN a SynapseStorage object with the appropriate asset view + synapse_store.storageFileview = asset_view + # WHEN getFilesInStorageDataset is called for the given dataset + dataset_files = synapse_store.getFilesInStorageDataset( + datasetId=dataset_id, fileNames=filenames + ) + # AND the filenames are filtered as appropriate + if filenames: + files_to_remove = [] + for f in expected_files: + retain = False + for name in filenames: + if name in f[1]: + retain = True + if not retain: + files_to_remove.append(f) + + for file in files_to_remove: + expected_files.remove(file) + + # THEN the expected files are returned + # AND there are no unexpected files + assert dataset_files == expected_files + # AND the (synId, path) order is correct + synapse_id_regex = re_compile(SYN_ID_REGEX) + if dataset_files: + assert synapse_id_regex.fullmatch(dataset_files[0][0]) + + @pytest.mark.parametrize( + "asset_view, dataset_id, exception, exception_message", + [ + ( + "syn23643253", + "syn64319379", + LookupError, + "Dataset syn64319379 could not be found", + ), + ("syn64367119", "syn61374924", ValueError, "Fileview syn64367119 is empty"), + ], + ids=[ + "empty dataset", + "empty view", + ], + ) + def test_get_files_in_storage_dataset_exception( + self, asset_view, dataset_id, exception, exception_message, synapse_store + ): + # GIVEN a SynapseStorage object with the appropriate asset view + synapse_store.storageFileview = asset_view + # AND the correct and up-to-date fileview + synapse_store.query_fileview(force_requery=True) + # WHEN getFilesInStorageDataset is called + # THEN the appropriate exception should be raised, with the appropriate message + with pytest.raises(exception, match=exception_message): + synapse_store.getFilesInStorageDataset(datasetId=dataset_id, fileNames=None) + + @pytest.mark.parametrize( + "asset_view, dataset_id, expected_files", + [ + ( + "syn23643253", + "syn61374924", + [ + ("syn61374926", "schematic - main/BulkRNASeq and files/txt1.txt"), + ("syn61374930", "schematic - main/BulkRNASeq and files/txt2.txt"), + ("syn62282794", "schematic - main/BulkRNASeq and files/txt3.txt"), + ("syn62282720", "schematic - main/BulkRNASeq and files/txt4.txt"), + ], + ), + ( + "syn23643253", + "syn25614635", + [ + ( + "syn25614636", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt", + ), + ( + "syn25614637", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt", + ), + ( + "syn25614638", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt", + ), + ], + ), + ( + "syn63917487", + "syn63917494", + [ + ( + "syn63917520", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt1.txt", + ), + ( + "syn63917521", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt2.txt", + ), + ( + "syn63917522", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt3.txt", + ), + ( + "syn63917518", + "schematic - main/Test files and dataset annotations/Test BulkRNAseq w annotation/txt4.txt", + ), + ], + ), + ( + "syn23643253", + "syn63927665", + [ + ( + "syn63927671", + "schematic - main/BulkRNAseq nested files/data/txt1.txt", + ), + ( + "syn63927672", + 
"schematic - main/BulkRNAseq nested files/data/txt2.txt", + ), + ( + "syn63927673", + "schematic - main/BulkRNAseq nested files/data/txt3.txt", + ), + ( + "syn63927670", + "schematic - main/BulkRNAseq nested files/data/txt4.txt", + ), + ], + ), + ( + "syn23643253", + "syn63987067", + [ + ( + "syn63987072", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt1.txt", + ), + ( + "syn63987073", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt2.txt", + ), + ( + "syn63987074", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt3.txt", + ), + ( + "syn63987071", + "schematic - main/BulkRNAseq and double nested files/dataset/folder 1/data/txt4.txt", + ), + ], + ), + ], + ids=[ + "top level folder", + "nested dataset", + "contentType:dataset annotation", + "nested data files", + "doubly nested data files", + ], + ) + @pytest.mark.parametrize( + "filenames", + [None, ["txt1.txt", "txt2.txt"]], + ids=["no file filtering", "file filtering"], + ) + def test_mock_get_files_in_storage_dataset( + self, + synapse_store, + filenames, + asset_view, + dataset_id, + expected_files, + ): + # GIVEN a test configuration + TEST_CONFIG = Configuration() + + with patch( + "schematic.store.synapse.CONFIG", return_value=TEST_CONFIG + ) as mock_config: + # AND the appropriate asset view + mock_config.synapse_master_fileview_id = asset_view + # AND the appropriately filtered filenames + if filenames: + files_to_remove = [] + for f in expected_files: + retain = False + for name in filenames: + if name in f[1]: + retain = True + if not retain: + files_to_remove.append(f) + + for file in files_to_remove: + expected_files.remove(file) + + # WHEN getFilesInStorageDataset is called for the given dataset + dataset_files = synapse_store.getFilesInStorageDataset( + datasetId=dataset_id, fileNames=filenames + ) + + # THEN the expected files are returned + # AND there are no unexpected files + assert dataset_files == expected_files + # AND the (synId, path) order is correct + synapse_id_regex = re_compile(SYN_ID_REGEX) + if dataset_files: + assert synapse_id_regex.fullmatch(dataset_files[0][0]) diff --git a/tests/test_api.py b/tests/test_api.py index 1f2d79add..842210d5b 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -14,11 +14,11 @@ from flask.testing import FlaskClient from opentelemetry import trace -from schematic.configuration.configuration import Configuration +from schematic.configuration.configuration import CONFIG, Configuration from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser +from schematic.utils.df_utils import read_csv from schematic.utils.general import create_temp_folder -from schematic.configuration.configuration import CONFIG logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -838,7 +838,7 @@ def test_generate_manifest_file_based_annotations( response_google_sheet = json.loads(response.data) # open the google sheet - google_sheet_df = pd.read_csv( + google_sheet_df = read_csv( response_google_sheet[0] + "/export?gid=0&format=csv" ) @@ -894,7 +894,7 @@ def test_generate_manifest_not_file_based_with_annotations( response_google_sheet = json.loads(response.data) # open the google sheet - google_sheet_df = pd.read_csv( + google_sheet_df = read_csv( response_google_sheet[0] + "/export?gid=0&format=csv" ) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index ade80fbe9..2358019d9 
100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from pandas.testing import assert_series_equal from schematic.configuration.configuration import Configuration from schematic.manifest.generator import ManifestGenerator @@ -103,6 +104,9 @@ def mock_create_blank_google_sheet(): @pytest.fixture(params=[True, False], ids=["sheet_url", "data_frame"]) def manifest(dataset_id, manifest_generator, request): + """ + Only seems to be used for TestManifestGenerator.test_get_manifest_first_time + """ # Rename request param for readability sheet_url = request.param @@ -763,13 +767,11 @@ def test_create_manifests( assert all_results == expected_result @pytest.mark.parametrize( - "component,datasetId,expected_file_based,expected_rows,expected_files", + "component, datasetId, expected_rows, expected_files, expected_annotations", [ - ("Biospecimen", "syn61260107", False, 4, None), ( "BulkRNA-seqAssay", "syn61374924", - True, 4, pd.Series( [ @@ -780,18 +782,50 @@ def test_create_manifests( ], name="Filename", ), + { + "File Format": pd.Series( + ["BAM", "CRAM", "CSV/TSV", ""], name="File Format" + ), + "Genome Build": pd.Series( + ["GRCh37", "GRCh38", "GRCm38", ""], name="Genome Build" + ), + }, + ), + ( + "BulkRNA-seqAssay", + "syn25614635", + 3, + pd.Series( + [ + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_A.txt", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_B.txt", + "schematic - main/TestDatasets/TestDataset-Annotations-v3/Sample_C.txt", + ], + name="Filename", + ), + { + "File Format": pd.Series( + ["txt", "csv", "fastq"], name="File Format" + ), + "Year of Birth": pd.Series(["1980", "", ""], name="Year of Birth"), + }, ), ], - ids=["Record based", "File based"], + ids=[ + "top level folder", + "nested dataset", + ], ) + @pytest.mark.parametrize("use_annotations", [True, False]) def test_get_manifest_with_files( self, helpers, component, datasetId, - expected_file_based, expected_rows, expected_files, + expected_annotations, + use_annotations, ): """ Test to ensure that @@ -813,27 +847,37 @@ def test_get_manifest_with_files( path_to_data_model=path_to_data_model, graph=graph_data_model, root=component, - use_annotations=True, + use_annotations=use_annotations, ) - # WHEN a manifest is generated for the appropriate dataset as a dataframe + # WHEN a filebased manifest is generated for the appropriate dataset as a dataframe manifest = generator.get_manifest( dataset_id=datasetId, output_format="dataframe" ) - # AND it is determined if the manifest is filebased - is_file_based = "Filename" in manifest.columns - # AND the number of rows are checked n_rows = manifest.shape[0] # THEN the manifest should have the expected number of rows assert n_rows == expected_rows - # AND the manifest should be filebased or not as expected - assert is_file_based == expected_file_based + # AND the manifest should have the columns expected of filebased metadata + assert "Filename" in manifest.columns + assert "entityId" in manifest.columns + + # AND the manifest should have the expected files from the dataset + assert_series_equal( + manifest["Filename"], + expected_files, + check_dtype=False, + ) - # AND if the manifest is file based - if expected_file_based: - # THEN the manifest should have the expected files - assert manifest["Filename"].equals(expected_files) + # AND if annotations are used to generate the manifest + if use_annotations: + # THEN the annotations in the generated manifest should match the expected 
annotations + for attribute, annotations in expected_annotations.items(): + assert_series_equal( + manifest[attribute], + annotations, + check_dtype=False, + ) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 409c653b0..670aa2474 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -462,7 +462,7 @@ def test_generate_data_model_graph(self, helpers, data_model, data_model_labels) ) # Check Edge directions - assert 4 == (len(graph.out_edges("TissueStatus"))) + assert 6 == (len(graph.out_edges("TissueStatus"))) assert 2 == (len(graph.in_edges("TissueStatus"))) diff --git a/tests/test_store.py b/tests/test_store.py index c9f32bec2..49b79a3c0 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -11,7 +11,7 @@ import uuid from contextlib import nullcontext as does_not_raise from typing import Any, Callable, Generator -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock, MagicMock, patch import pandas as pd import pytest @@ -22,12 +22,14 @@ from synapseclient.entity import File, Project from synapseclient.models import Annotations from synapseclient.models import Folder as FolderModel +from synapseclient.table import build_table from schematic.configuration.configuration import CONFIG, Configuration from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser from schematic.store.base import BaseStorage from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage +from schematic.utils.df_utils import STR_NA_VALUES_FILTERED from schematic.utils.general import check_synapse_cache_size, create_temp_folder from tests.conftest import Helpers from tests.utils import CleanupItem @@ -463,46 +465,103 @@ def test_getDatasetProject(self, dataset_id, synapse_store): ( True, [ - ("syn126", "schematic - main/parent_folder/test_file"), + ("syn126", "syn_mock", "schematic - main/parent_folder/test_file"), ( "syn125", + "syn_mock", "schematic - main/parent_folder/test_folder/test_file_2", ), ], ), - (False, [("syn126", "test_file"), ("syn125", "test_file_2")]), + ( + False, + [ + ("syn126", "syn_mock", "test_file"), + ("syn125", "syn_mock", "test_file_2"), + ], + ), ], ) def test_getFilesInStorageDataset(self, synapse_store, full_path, expected): - mock_return = [ + mock_table_dataframe_return = pd.DataFrame( + { + "id": ["syn126", "syn125"], + "parentId": ["syn_mock", "syn_mock"], + "path": [ + "schematic - main/parent_folder/test_file", + "schematic - main/parent_folder/test_folder/test_file_2", + ], + } + ) + + with patch.object( + synapse_store, "storageFileviewTable", mock_table_dataframe_return + ), patch.object(synapse_store, "query_fileview") as mocked_query: + # query_fileview is the function called to get the fileview + mocked_query.return_value = mock_table_dataframe_return + + file_list = synapse_store.getFilesInStorageDataset( + datasetId="syn_mock", fileNames=None, fullpath=full_path + ) + assert file_list == expected + + @pytest.mark.parametrize( + "mock_table_dataframe, raised_exception, exception_message", + [ ( - ("parent_folder", "syn123"), - [("test_folder", "syn124")], - [("test_file", "syn126")], + pd.DataFrame( + { + "id": ["child_syn_mock"], + "path": ["schematic - main/parent_folder/child_entity"], + "parentId": ["wrong_syn_mock"], + } + ), + LookupError, + "Dataset syn_mock could not be found", ), ( - ( - os.path.join("schematic - main", "parent_folder", "test_folder"), - "syn124", + pd.DataFrame( + { + "id": 
[], + "path": [], + "parentId": [], + } ), - [], - [("test_file_2", "syn125")], + ValueError, + "Fileview mock_table_id is empty", ), - ] - with patch( - "synapseutils.walk_functions._help_walk", return_value=mock_return - ) as mock_walk_patch, patch( - "schematic.store.synapse.SynapseStorage.getDatasetProject", - return_value="syn23643250", - ) as mock_project_id_patch, patch( - "synapseclient.entity.Entity.__getattr__", return_value="schematic - main" - ) as mock_project_name_patch, patch.object( - synapse_store.syn, "get", return_value=Project(name="schematic - main") - ): - file_list = synapse_store.getFilesInStorageDataset( - datasetId="syn_mock", fileNames=None, fullpath=full_path - ) - assert file_list == expected + ], + ids=["missing_dataset", "empty_fileview"], + ) + @pytest.mark.parametrize( + "full_path,", + [ + (True), + (False), + ], + ) + def test_get_files_in_storage_dataset_exception( + self, + synapse_store, + mock_table_dataframe, + raised_exception, + exception_message, + full_path, + ): + with patch.object( + synapse_store, "storageFileviewTable", mock_table_dataframe + ), patch.object( + synapse_store, "storageFileview", "mock_table_id" + ) as mock_table_id, patch.object( + synapse_store, "query_fileview" + ) as mocked_query: + # query_fileview is the function called to get the fileview + mocked_query.return_value = mock_table_dataframe + + with pytest.raises(raised_exception, match=exception_message): + synapse_store.getFilesInStorageDataset( + datasetId="syn_mock", fileNames=None, fullpath=full_path + ) @pytest.mark.parametrize("downloadFile", [True, False]) def test_getDatasetManifest(self, synapse_store, downloadFile): @@ -926,6 +985,26 @@ async def mock_success_coro() -> None: await synapse_store._process_store_annos(new_tasks) mock_store_async.assert_not_called() + @pytest.mark.parametrize( + "dataset_id, dataset_folder_list, expected_clause", + [ + ("syn12345678", None, "parentId='syn12345678'"), + ( + "syn63927665", + ["syn63927665", "syn63927667"], + "parentId IN ('syn63927665', 'syn63927667')", + ), + (None, None, ""), + ], + ) + def test_build_clause_from_dataset_id( + self, dataset_id, dataset_folder_list, expected_clause + ): + dataset_clause = SynapseStorage.build_clause_from_dataset_id( + dataset_id=dataset_id, dataset_folder_list=dataset_folder_list + ) + assert dataset_clause == expected_clause + class TestDatasetFileView: def test_init(self, dataset_id, dataset_fileview, synapse_store): @@ -1244,7 +1323,7 @@ async def copy_folder_and_update_manifest( table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId) days_to_follow_up = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1281,7 +1360,7 @@ async def copy_folder_and_update_manifest( table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId) days_to_follow_up = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1343,7 +1422,7 @@ async def test_upsert_table( # Query table for DaystoFollowUp column table_query = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) @@ -1384,7 +1463,7 @@ async def test_upsert_table( table_id = synapse_store.syn.findEntityId(name=table_name, 
parent=projectId) table_query = ( synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}") - .asDataFrame() + .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) .squeeze() ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2a0744439..4a8d30cd3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,19 +2,11 @@ import json import logging import os -import tempfile -import time -from datetime import datetime -from pathlib import Path -from typing import Generator, Union import numpy as np import pandas as pd import pytest -import synapseclient.core.cache as cache -from _pytest.fixtures import FixtureRequest from pandas.testing import assert_frame_equal -from synapseclient.core.exceptions import SynapseHTTPError from schematic.models.metadata import MetadataModel from schematic.models.validate_manifest import ValidateManifest @@ -26,14 +18,8 @@ convert_graph_to_jsonld, ) from schematic.schemas.data_model_parser import DataModelParser -from schematic.utils import cli_utils, df_utils, general, io_utils, validate_utils -from schematic.utils.df_utils import load_df -from schematic.utils.general import ( - calculate_datetime, - check_synapse_cache_size, - clear_synapse_cache, - entity_type_mapping, -) +from schematic.utils import cli_utils, df_utils, io_utils, validate_utils +from schematic.utils.df_utils import load_df, read_csv from schematic.utils.schema_utils import ( check_for_duplicate_components, check_if_display_name_is_valid_label, @@ -168,13 +154,6 @@ DATA_MODEL_DICT = {"example.model.csv": "CSV", "example.model.jsonld": "JSONLD"} -test_disk_storage = [ - (2, 4000, 16000), - (1000, 4000, 16000), - (2000000, 1900000, 2000000), - (1073741825, 1073741824, 1181116006.4), -] - def get_metadataModel(helpers, model_name: str): metadataModel = MetadataModel( @@ -185,164 +164,6 @@ def get_metadataModel(helpers, model_name: str): return metadataModel -# create temporary files with various size based on request -@pytest.fixture() -def create_temp_query_file( - tmp_path: Path, request: FixtureRequest -) -> Generator[tuple[Path, Path, Path], None, None]: - """create temporary files of various size based on request parameter. 
- - Args: - tmp_path (Path): temporary file path - request (any): a request for a fixture from a test - - Yields: - Generator[Tuple[Path, Path, Path]]: return path of mock synapse cache directory, mock table query folder and csv - """ - # define location of mock synapse cache - mock_synapse_cache_dir = tmp_path / ".synapseCache/" - mock_synapse_cache_dir.mkdir() - mock_sub_folder = mock_synapse_cache_dir / "123" - mock_sub_folder.mkdir() - mock_table_query_folder = mock_sub_folder / "456" - mock_table_query_folder.mkdir() - - # create mock table query csv - mock_synapse_table_query_csv = ( - mock_table_query_folder / "mock_synapse_table_query.csv" - ) - with open(mock_synapse_table_query_csv, "wb") as f: - f.write(b"\0" * request.param) - yield mock_synapse_cache_dir, mock_table_query_folder, mock_synapse_table_query_csv - - -class TestGeneral: - @pytest.mark.parametrize("create_temp_query_file", [3, 1000], indirect=True) - def test_clear_synapse_cache(self, create_temp_query_file) -> None: - # define location of mock synapse cache - ( - mock_synapse_cache_dir, - mock_table_query_folder, - mock_synapse_table_query_csv, - ) = create_temp_query_file - # create a mock cache map - mock_cache_map = mock_table_query_folder / ".cacheMap" - mock_cache_map.write_text( - f"{mock_synapse_table_query_csv}: '2022-06-13T19:24:27.000Z'" - ) - - assert os.path.exists(mock_synapse_table_query_csv) - - # since synapse python client would compare last modified date and before date - # we have to create a little time gap here - time.sleep(1) - - # clear cache - my_cache = cache.Cache(cache_root_dir=mock_synapse_cache_dir) - clear_synapse_cache(my_cache, minutes=0.0001) - # make sure that cache files are now gone - assert os.path.exists(mock_synapse_table_query_csv) == False - assert os.path.exists(mock_cache_map) == False - - def test_calculate_datetime_before_minutes(self): - input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") - minutes_before = calculate_datetime( - input_date=input_date, minutes=10, before_or_after="before" - ) - expected_result_date_before = datetime.strptime( - "07/20/23 17:26:34", "%m/%d/%y %H:%M:%S" - ) - assert minutes_before == expected_result_date_before - - def test_calculate_datetime_after_minutes(self): - input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") - minutes_after = calculate_datetime( - input_date=input_date, minutes=10, before_or_after="after" - ) - expected_result_date_after = datetime.strptime( - "07/20/23 17:46:34", "%m/%d/%y %H:%M:%S" - ) - assert minutes_after == expected_result_date_after - - def test_calculate_datetime_raise_error(self): - with pytest.raises(ValueError): - input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") - minutes = calculate_datetime( - input_date=input_date, minutes=10, before_or_after="error" - ) - - # this test might fail for windows machine - @pytest.mark.not_windows - @pytest.mark.parametrize( - "create_temp_query_file,local_disk_size,gh_disk_size", - test_disk_storage, - indirect=["create_temp_query_file"], - ) - def test_check_synapse_cache_size( - self, - create_temp_query_file, - local_disk_size: int, - gh_disk_size: Union[int, float], - ) -> None: - mock_synapse_cache_dir, _, _ = create_temp_query_file - disk_size = check_synapse_cache_size(mock_synapse_cache_dir) - - # For some reasons, when running in github action, the size of file changes. 
- if IN_GITHUB_ACTIONS: - assert disk_size == gh_disk_size - else: - assert disk_size == local_disk_size - - def test_find_duplicates(self): - mock_list = ["foo", "bar", "foo"] - mock_dups = {"foo"} - - test_dups = general.find_duplicates(mock_list) - assert test_dups == mock_dups - - def test_dict2list_with_dict(self): - mock_dict = {"foo": "bar"} - mock_list = [{"foo": "bar"}] - - test_list = general.dict2list(mock_dict) - assert test_list == mock_list - - def test_dict2list_with_list(self): - # mock_dict = {'foo': 'bar'} - mock_list = [{"foo": "bar"}] - - test_list = general.dict2list(mock_list) - assert test_list == mock_list - - @pytest.mark.parametrize( - "entity_id,expected_type", - [ - ("syn27600053", "folder"), - ("syn29862078", "file"), - ("syn23643253", "asset view"), - ("syn30988314", "folder"), - ("syn51182432", "org.sagebionetworks.repo.model.table.TableEntity"), - ], - ) - def test_entity_type_mapping(self, synapse_store, entity_id, expected_type): - syn = synapse_store.syn - - entity_type = entity_type_mapping(syn, entity_id) - assert entity_type == expected_type - - def test_entity_type_mapping_invalid_entity_id(self, synapse_store): - syn = synapse_store.syn - - # test with an invalid entity id - with pytest.raises(SynapseHTTPError) as exception_info: - entity_type_mapping(syn, "syn123456") - - def test_download_manifest_to_temp_folder(self): - with tempfile.TemporaryDirectory() as tmpdir: - path_dir = general.create_temp_folder(tmpdir) - assert os.path.exists(path_dir) - - class TestCliUtils: def test_query_dict(self): mock_dict = {"k1": {"k2": {"k3": "foobar"}}} @@ -473,7 +294,7 @@ def test_load_df(self, helpers, preserve_raw_input): test_col = "Check NA" file_path = helpers.get_data_path("mock_manifests", "Invalid_Test_Manifest.csv") - unprocessed_df = pd.read_csv(file_path, encoding="utf8") + unprocessed_df = read_csv(file_path, encoding="utf8") df = df_utils.load_df( file_path, preserve_raw_input=preserve_raw_input, data_model=False ) @@ -1086,6 +907,11 @@ def test_validate_property_schema(self, helpers): "example.model.csv", "Patient", ), + ( + "mock_manifests/Valid_none_value_test_manifest.csv", + "example.model.csv", + "Biospecimen", + ), ( "mock_manifests/Valid_Test_Manifest_with_nones.csv", "example_test_nones.model.csv", @@ -1100,7 +926,7 @@ def test_convert_nan_entries_to_empty_strings( manifest_path = helpers.get_data_path(manifest) model_path = helpers.get_data_path(model) - ## Gather parmeters needed to run validate_manifest_rules + # Gather parmeters needed to run validate_manifest_rules errors = [] load_args = { "dtype": "string", @@ -1122,13 +948,13 @@ def test_convert_nan_entries_to_empty_strings( **load_args, ) - metadataModel = get_metadataModel(helpers, model) + get_metadataModel(helpers, model) # Instantiate Validate manifest, and run manifest validation # In this step the manifest is modified while running rule # validation so need to do this step to get the updated manfest. 
vm = ValidateManifest(errors, manifest, manifest_path, dmge, json_schema) - manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules( + manifest, _, _ = vm.validate_manifest_rules( manifest, dmge, restrict_rules=False, @@ -1143,6 +969,10 @@ def test_convert_nan_entries_to_empty_strings( if root_node == "Patient": assert manifest["Family History"][0] == [""] assert output["Family History"][0] == [""] + elif root_node == "Biospecimen": + assert output["Tissue Status"][2] == "None" + assert output["Tissue Status"][3] == "None" + assert output["Tissue Status"][4] == "None" elif root_node == "MockComponent": assert manifest["Check List"][2] == [""] assert manifest["Check List Like Enum"][2] == [] diff --git a/tests/test_validation.py b/tests/test_validation.py index c16a95bb7..646a2e543 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -54,11 +54,17 @@ class TestManifestValidation: "mock_manifests/Valid_Test_Manifest_with_nones.csv", "MockComponent", ), + ( + "example.model.csv", + "mock_manifests/Valid_none_value_test_manifest.csv", + "Biospecimen", + ), ], ids=[ "example_model", "example_with_no_entry_for_cond_required_columns", - "example_with_nones", + "example_with_nones_mock_component", + "example_with_nones_biospecimen", ], ) @pytest.mark.parametrize( diff --git a/tests/test_viz.py b/tests/test_viz.py index b94d79688..2ab78b9ce 100644 --- a/tests/test_viz.py +++ b/tests/test_viz.py @@ -1,11 +1,10 @@ import json import logging -import os from io import StringIO -import pandas as pd import pytest +from schematic.utils.df_utils import read_csv from schematic.visualization.attributes_explorer import AttributesExplorer from schematic.visualization.tangled_tree import TangledTree @@ -44,7 +43,7 @@ class TestVisualization: def test_ae(self, helpers, attributes_explorer): attributes_str = attributes_explorer.parse_attributes(save_file=False) - df = pd.read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"]) + df = read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"]) # For the attributes df define expected columns expect_col_names = [ @@ -76,7 +75,7 @@ def test_ce(self, component, attributes_explorer): component=component, save_file=False, include_index=False ) # convert to dataframe - component_attributes = pd.read_csv(StringIO(component_attributes_str)) + component_attributes = read_csv(StringIO(component_attributes_str)) # For the attributes df define expected columns expect_col_names = [ @@ -103,7 +102,7 @@ def test_text(self, helpers, tangled_tree): # Get text for tangled tree. 
text_str = tangled_tree.get_text_for_tangled_tree(text_format, save_file=False) - df = pd.read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"]) + df = read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"]) # Define expected text associated with 'Patient' and 'Imaging' tree expected_patient_text = ["Biospecimen", "BulkRNA-seqAssay"] diff --git a/tests/unit/test_df_utils.py b/tests/unit/test_df_utils.py new file mode 100644 index 000000000..28db591a2 --- /dev/null +++ b/tests/unit/test_df_utils.py @@ -0,0 +1,49 @@ +from io import BytesIO + +import numpy as np +import pandas as pd +from pandas._libs.parsers import STR_NA_VALUES + +from schematic.utils.df_utils import read_csv + + +class TestReadCsv: + def test_none_in_na_values(self) -> None: + # GIVEN a pandas DF that includes a column with a None value + df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]}) + + # AND None is included in the STR_NA_VALUES + if "None" not in STR_NA_VALUES: + STR_NA_VALUES.add("None") + + # AND its CSV representation + csv_buffer = BytesIO() + df.to_csv(csv_buffer, index=False) + csv_buffer.seek(0) + + # WHEN the CSV is read + result = read_csv(csv_buffer, na_values=STR_NA_VALUES) + + # THEN the None string value is not preserved + assert not result.equals(df) + assert result["col1"][0] == "AAA" + assert result["col1"][1] == "BBB" + assert result["col1"][2] is np.nan + + def test_none_not_in_na_values(self) -> None: + # GIVEN a pandas DF that includes a column with a None value + df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]}) + + # AND its CSV representation + csv_buffer = BytesIO() + df.to_csv(csv_buffer, index=False) + csv_buffer.seek(0) + + # WHEN the CSV is read + result = read_csv(csv_buffer) + + # THEN the None string value is preserved + assert result.equals(df) + assert result["col1"][0] == "AAA" + assert result["col1"][1] == "BBB" + assert result["col1"][2] == "None" diff --git a/tests/unit/test_store_synapse.py b/tests/unit/test_store_synapse.py new file mode 100644 index 000000000..c2b8d1403 --- /dev/null +++ b/tests/unit/test_store_synapse.py @@ -0,0 +1,59 @@ +import pandas as pd +import pytest + +from schematic.store.synapse import SynapseStorage + + +class TestStoreSynapse: + synapse_store = SynapseStorage() + test_manifest_good = pd.DataFrame( + {"Filename": ["test_file.txt"], "entityId": ["syn1"]} + ) + test_manifest_no_entityid = pd.DataFrame({"Filename": ["test_file.txt"]}) + test_dataset_files = [ + ("syn2", "test_file_1.txt"), + ("syn3", "test_file_2.txt"), + ] + + def test_get_file_entityIds_only_new_files_manifest_is_none(self) -> None: + with pytest.raises(UnboundLocalError, match="No manifest was passed in"): + self.synapse_store._get_file_entityIds( + dataset_files=[], only_new_files=True, manifest=None + ) + + def test_get_file_entityIds_only_new_files_manifest_no_entityId_column( + self, + ) -> None: + with pytest.raises(ValueError, match="The manifest in your dataset"): + self.synapse_store._get_file_entityIds( + dataset_files=[], + only_new_files=True, + manifest=self.test_manifest_no_entityid, + ) + + def test_get_file_entityIds_only_new_files_manifest_good(self) -> None: + assert self.synapse_store._get_file_entityIds( + dataset_files=self.test_dataset_files, + only_new_files=True, + manifest=self.test_manifest_good, + ) == { + "Filename": ["test_file_1.txt", "test_file_2.txt"], + "entityId": ["syn2", "syn3"], + } + + def test_get_file_entityIds_only_new_files_manifest_good_no_new_files(self) -> None: + assert self.synapse_store._get_file_entityIds( + 
dataset_files=[("syn1", "test_file.txt")], + only_new_files=True, + manifest=self.test_manifest_good, + ) == {"Filename": [], "entityId": []} + + def test_get_file_entityIds_all_files(self) -> None: + assert self.synapse_store._get_file_entityIds( + dataset_files=self.test_dataset_files, + only_new_files=False, + manifest=self.test_manifest_good, + ) == { + "Filename": ["test_file_1.txt", "test_file_2.txt"], + "entityId": ["syn2", "syn3"], + } diff --git a/tests/unit/test_utils_general.py b/tests/unit/test_utils_general.py new file mode 100644 index 000000000..ed8a7913f --- /dev/null +++ b/tests/unit/test_utils_general.py @@ -0,0 +1,236 @@ +import os +import tempfile +import time +from datetime import datetime +from pathlib import Path +from typing import Generator +from unittest import mock +from unittest.mock import MagicMock + +import pytest +import synapseclient.core.cache as cache +from _pytest.fixtures import FixtureRequest +from synapseclient.core.exceptions import SynapseHTTPError + +from schematic.utils import general +from schematic.utils.general import ( + calculate_datetime, + check_synapse_cache_size, + clear_synapse_cache, + entity_type_mapping, +) + +TEST_DISK_STORAGE = [ + (2, 2), + (1000, 1000), + (2000000, 2000000), + (1073741825, 1073741825), +] + + +# create temporary files with various size based on request +@pytest.fixture() +def create_temp_query_file( + tmp_path: Path, request: FixtureRequest +) -> Generator[tuple[Path, Path, Path], None, None]: + """create temporary files of various size based on request parameter. + + Args: + tmp_path (Path): temporary file path + request (any): a request for a fixture from a test + + Yields: + Generator[Tuple[Path, Path, Path]]: return path of mock synapse cache directory, mock table query folder and csv + """ + # define location of mock synapse cache + mock_synapse_cache_dir = tmp_path / ".synapseCache/" + mock_synapse_cache_dir.mkdir() + mock_sub_folder = mock_synapse_cache_dir / "123" + mock_sub_folder.mkdir() + mock_table_query_folder = mock_sub_folder / "456" + mock_table_query_folder.mkdir() + + # create mock table query csv + mock_synapse_table_query_csv = ( + mock_table_query_folder / "mock_synapse_table_query.csv" + ) + with open(mock_synapse_table_query_csv, "wb") as f: + f.write(b"\0" * request.param) + yield mock_synapse_cache_dir, mock_table_query_folder, mock_synapse_table_query_csv + + +@pytest.mark.parametrize( + "directory, file_sizes, expected_size", + [ + ("", [1024], 1024), # Default directory with 1 KB file + ("/custom/directory", [2048], 2048), # Custom directory with 2 KB file + ("", [], 0), # Empty directory + ("", [1024, 2048], 3072), # Directory with multiple files (1 KB + 2 KB) + ], +) +def test_check_synapse_cache_size(mocker, directory, file_sizes, expected_size): + """Test that the file sizes add up to the correct bytes""" + # Create a list of mocked files based on file_sizes + mock_files = [] + for size in file_sizes: + mock_file = MagicMock() + mock_file.is_file.return_value = True + mock_file.stat.return_value.st_size = size + mock_files.append(mock_file) + + # Mock Path().rglob() to return the mocked files + mock_rglob = mocker.patch( + "schematic.utils.general.Path.rglob", return_value=mock_files + ) + # Call the function with the directory parameter + result = check_synapse_cache_size(directory=directory) + + # Assert the result matches the expected size + assert result == expected_size + + +@pytest.mark.parametrize( + "directory", + [ + None, # Default directory + "/custom/directory", # Custom 
directory + ], +) +def test_check_synapse_cache_size_directory_call(directory): + """Test that the right directory is passed in""" + with mock.patch("schematic.utils.general.Path") as mock_path: + mock_rglob = MagicMock(return_value=[]) + mock_path.return_value.rglob = mock_rglob + + # Call the function with the directory parameter + if directory is None: + check_synapse_cache_size() + else: + check_synapse_cache_size(directory=directory) + + # Assert that Path was called with the correct directory + expected_directory = directory if directory else "/root/.synapseCache" + mock_path.assert_called_with(expected_directory) + + # Assert that rglob was called on the Path object + mock_rglob.assert_called_once_with("*") + + +class TestGeneral: + @pytest.mark.parametrize("create_temp_query_file", [3, 1000], indirect=True) + def test_clear_synapse_cache(self, create_temp_query_file) -> None: + # define location of mock synapse cache + ( + mock_synapse_cache_dir, + mock_table_query_folder, + mock_synapse_table_query_csv, + ) = create_temp_query_file + # create a mock cache map + mock_cache_map = mock_table_query_folder / ".cacheMap" + mock_cache_map.write_text( + f"{mock_synapse_table_query_csv}: '2022-06-13T19:24:27.000Z'" + ) + + assert os.path.exists(mock_synapse_table_query_csv) + + # since synapse python client would compare last modified date and before date + # we have to create a little time gap here + time.sleep(1) + + # clear cache + my_cache = cache.Cache(cache_root_dir=mock_synapse_cache_dir) + clear_synapse_cache(my_cache, minutes=0.0001) + # make sure that cache files are now gone + assert os.path.exists(mock_synapse_table_query_csv) == False + assert os.path.exists(mock_cache_map) == False + + def test_calculate_datetime_before_minutes(self): + input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") + minutes_before = calculate_datetime( + input_date=input_date, minutes=10, before_or_after="before" + ) + expected_result_date_before = datetime.strptime( + "07/20/23 17:26:34", "%m/%d/%y %H:%M:%S" + ) + assert minutes_before == expected_result_date_before + + def test_calculate_datetime_after_minutes(self): + input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") + minutes_after = calculate_datetime( + input_date=input_date, minutes=10, before_or_after="after" + ) + expected_result_date_after = datetime.strptime( + "07/20/23 17:46:34", "%m/%d/%y %H:%M:%S" + ) + assert minutes_after == expected_result_date_after + + def test_calculate_datetime_raise_error(self): + with pytest.raises(ValueError): + input_date = datetime.strptime("07/20/23 17:36:34", "%m/%d/%y %H:%M:%S") + minutes = calculate_datetime( + input_date=input_date, minutes=10, before_or_after="error" + ) + + # this test might fail for windows machine + @pytest.mark.parametrize( + "create_temp_query_file,local_disk_size", + TEST_DISK_STORAGE, + indirect=["create_temp_query_file"], + ) + def test_check_synapse_cache_size( + self, + create_temp_query_file, + local_disk_size: int, + ) -> None: + mock_synapse_cache_dir, _, _ = create_temp_query_file + disk_size = check_synapse_cache_size(mock_synapse_cache_dir) + assert disk_size == local_disk_size + + def test_find_duplicates(self): + mock_list = ["foo", "bar", "foo"] + mock_dups = {"foo"} + + test_dups = general.find_duplicates(mock_list) + assert test_dups == mock_dups + + def test_dict2list_with_dict(self): + mock_dict = {"foo": "bar"} + mock_list = [{"foo": "bar"}] + + test_list = general.dict2list(mock_dict) + assert test_list == mock_list + 
+
+    def test_dict2list_with_list(self):
+        # mock_dict = {'foo': 'bar'}
+        mock_list = [{"foo": "bar"}]
+
+        test_list = general.dict2list(mock_list)
+        assert test_list == mock_list
+
+    @pytest.mark.parametrize(
+        "entity_id,expected_type",
+        [
+            ("syn27600053", "folder"),
+            ("syn29862078", "file"),
+            ("syn23643253", "asset view"),
+            ("syn30988314", "folder"),
+            ("syn51182432", "org.sagebionetworks.repo.model.table.TableEntity"),
+        ],
+    )
+    def test_entity_type_mapping(self, synapse_store, entity_id, expected_type):
+        syn = synapse_store.syn
+
+        entity_type = entity_type_mapping(syn, entity_id)
+        assert entity_type == expected_type
+
+    def test_entity_type_mapping_invalid_entity_id(self, synapse_store):
+        syn = synapse_store.syn
+
+        # test with an invalid entity id
+        with pytest.raises(SynapseHTTPError) as exception_info:
+            entity_type_mapping(syn, "syn123456")
+
+    def test_download_manifest_to_temp_folder(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path_dir = general.create_temp_folder(tmpdir)
+            assert os.path.exists(path_dir)
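Note on the `read_csv` helper exercised above: the patch swaps `pd.read_csv` for `schematic.utils.df_utils.read_csv` throughout the tests and adds `tests/unit/test_df_utils.py`, but the helper's implementation is not part of this diff. The sketch below is only an illustration of behaviour consistent with the new `TestReadCsv` expectations, assuming the helper forwards to `pandas.read_csv` with a reduced set of default NA sentinels; the exact signature and the contents of `STR_NA_VALUES_FILTERED` are assumptions, not taken from the repository.

```python
from io import StringIO
from typing import Any

import pandas as pd
from pandas._libs.parsers import STR_NA_VALUES

# Hypothetical: drop "None" from pandas' default NA sentinels so the literal
# string survives parsing instead of being coerced to NaN.
STR_NA_VALUES_FILTERED = set(STR_NA_VALUES) - {"None"}


def read_csv(path_or_buffer: Any, **kwargs: Any) -> pd.DataFrame:
    """Read a CSV, keeping "None" as a string unless the caller overrides na_values."""
    kwargs.setdefault("na_values", STR_NA_VALUES_FILTERED)
    kwargs.setdefault("keep_default_na", False)
    return pd.read_csv(path_or_buffer, **kwargs)


if __name__ == "__main__":
    # "None" is preserved as a string, mirroring TestReadCsv.test_none_not_in_na_values
    frame = read_csv(StringIO("col1\nAAA\nBBB\nNone\n"))
    assert frame["col1"].tolist() == ["AAA", "BBB", "None"]

    # Passing the full sentinel set back in coerces "None" to NaN,
    # mirroring TestReadCsv.test_none_in_na_values
    frame = read_csv(StringIO("col1\nAAA\nBBB\nNone\n"), na_values=STR_NA_VALUES | {"None"})
    assert frame["col1"].isna().tolist() == [False, False, True]
```

With defaults along these lines, the literal string "None" round-trips through a manifest CSV instead of collapsing to NaN, which is the behaviour the new Biospecimen none-value test cases and the `Valid_none_value_test_manifest.csv` fixture rely on.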