From 6b9119f51e876aa52afad40f080821fb1dea58a1 Mon Sep 17 00:00:00 2001
From: Tom Close
Date: Thu, 15 Feb 2024 22:24:00 +1100
Subject: [PATCH] adapted always-include option to use mime-likes

---
 xnat_ingest/cli/upload.py | 39 +++++++++++++++++++++------------------
 xnat_ingest/session.py    | 23 +++++++++--------------
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/xnat_ingest/cli/upload.py b/xnat_ingest/cli/upload.py
index 897f1ae..c8a9fb0 100644
--- a/xnat_ingest/cli/upload.py
+++ b/xnat_ingest/cli/upload.py
@@ -95,12 +95,16 @@
 @click.option(
     "--always-include",
     "-i",
-    default=None,
-    type=click.Choice(("all", "dicom", "associated"), case_sensitive=False),
+    default=(),
+    type=str,
+    multiple=True,
     envvar="XNAT_INGEST_ALWAYSINCLUDE",
     help=(
-        "Whether to include scans in the upload regardless of whether they are "
-        "specified in a column or not"
+        "Scan types to always include in the upload, regardless of whether they are"
+        "specified in a column or not. Specified using the scan types IANA mime-type or "
+        "fileformats \"mime-like\" (see https://arcanaframework.github.io/fileformats/), "
+        "e.g. 'application/json', 'medimage/dicom-series', "
+        "'image/jpeg'). Use 'core/file-set' to include all file-types in the session"
     ),
 )
 @click.option(
@@ -145,7 +149,7 @@ def upload(
     log_file: Path,
     log_emails: LogEmail,
     mail_server: MailServer,
-    always_include: str,
+    always_include: ty.Sequence[str],
     raise_errors: bool,
     store_credentials: ty.Tuple[str, str],
     work_dir: ty.Optional[Path],
@@ -275,6 +279,18 @@ def iter_staged_sessions():
 
             # Create corresponding session on XNAT
             xproject = xnat_repo.connection.projects[session.project_id]
+
+            # Access Arcana dataset associated with project
+            try:
+                dataset = Dataset.load(session.project_id, xnat_repo)
+            except Exception as e:
+                logger.warning(
+                    e,
+                    f"Did not load dataset definition from {session.project_id} project "
+                    f"on {server}. Only the scan types specified in --always-include",
+                )
+                dataset = None
+
             xsubject = xnat_repo.connection.classes.SubjectData(
                 label=session.subject_id, parent=xproject
             )
@@ -297,19 +313,6 @@ def iter_staged_sessions():
                 f"{session.project_id}:{session.subject_id}:{session.visit_id}"
             )
 
-            # Access Arcana dataset associated with project
-            try:
-                dataset = Dataset.load(session.project_id, xnat_repo)
-            except Exception as e:
-                add_exc_note(
-                    e,
-                    f"Did not load dataset definition from {session.project_id} project "
-                    f"on {server}. Please set one up using the Arcana command line tool "
-                    "in order to check presence of required scans and associated "
-                    "files (e.g. raw-data exports)",
-                )
-                raise e
-
             # Anonymise DICOMs and save to directory prior to upload
             if always_include:
                 logger.info(
diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py
index 6503699..cf6a64a 100644
--- a/xnat_ingest/session.py
+++ b/xnat_ingest/session.py
@@ -104,8 +104,8 @@ def dicom_dirs(self) -> ty.List[Path]:
 
     def select_resources(
         self,
-        dataset: Dataset,
-        always_include: ty.Optional[str] = None,
+        dataset: ty.Optional[Dataset],
+        always_include: ty.Sequence[str] = (),
     ) -> ty.Iterator[ty.Tuple[str, str, str, FileSet]]:
         """Returns selected resources that match the columns in the dataset definition
 
@@ -113,10 +113,10 @@ def select_resources(
         ----------
         dataset : Dataset
            Arcana dataset definition
-        always_include : str, optional
-            whether to scans regardless of whether they are explicitly
-            specified by a column in the dataset or not. Valid options are
-            'all', 'dicoms', 'associated'
+        always_include : sequence[str]
+            mime-types or "mime-like" (see https://arcanaframework.github.io/fileformats/)
+            of file-format to always include in the upload, regardless of whether they are
+            specified in the dataset or not
 
         Yields
         ------
@@ -132,16 +132,11 @@ def select_resources(
         store = MockDataStore(self)
 
         uploaded = set()
-        if always_include:
+        for mime_like in always_include:
+            fileformat = from_mime(mime_like)
             for scan in self.scans.values():
                 for resource_name, fileset in scan.resources.items():
-                    if (
-                        always_include == "all"
-                        or always_include == "dicom"
-                        and resource_name == "DICOM"
-                        or always_include == "associated"
-                        and resource_name != "DICOM"
-                    ):
+                    if isinstance(fileset, fileformat):
                         uploaded.add((scan.id, resource_name))
                         yield scan.id, scan.type, resource_name, fileset
         for column in dataset.columns.values():