From 1d1bc62b1848d40ca60ab37a79462f925ef71f66 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 12:10:55 +1000 Subject: [PATCH 1/9] fixed up handling of alternative datatypes --- xnat_ingest/cli/stage.py | 35 +++++++++++++++++++++---------- xnat_ingest/tests/test_session.py | 15 +------------ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/xnat_ingest/cli/stage.py b/xnat_ingest/cli/stage.py index 79eba75..cddcc1f 100644 --- a/xnat_ingest/cli/stage.py +++ b/xnat_ingest/cli/stage.py @@ -7,6 +7,7 @@ import tempfile from tqdm import tqdm from fileformats.core import FileSet +from fileformats.medimage import DicomSeries from xnat_ingest.cli.base import cli from xnat_ingest.session import ImagingSession from frametree.xnat import Xnat # type: ignore[import-untyped] @@ -25,13 +26,13 @@ @cli.command( - help="""Stages DICOM and associated files found in the input directories into separate -directories for each session + help="""Stages images found in the input directories into separate directories for each +imaging acquisition session -DICOMS_PATH is either the path to a directory containing the DICOM files to upload, or -a glob pattern that selects the DICOM paths directly +FILES_PATH is either the path to a directory containing the files to upload, or +a glob pattern that selects the paths directly -STAGING_DIR is the directory that the files for each session are collated to before they +OUTPUT_DIR is the directory that the files for each session are collated to before they are uploaded to XNAT """, ) @@ -42,9 +43,15 @@ type=str, metavar="", multiple=True, - default=["medimage/dicom-series"], - envvar="XINGEST_DATATYPE", - help="The datatype of the primary files to to upload", + default=None, + envvar="XINGEST_DATATYPES", + help=( + 'The MIME-type(s) (or "MIME-like" see FileFormats docs) of potential datatype(s) ' + "of the primary files to to upload, defaults to 'medimage/dicom-series'. " + "Any formats implemented in the FileFormats Python package " + "(https://github.com/ArcanaFramework/fileformats) that implement the 'read_metadata' " + '"extra" are supported, see FF docs on how to add support for new formats.' + ), ) @click.option( "--project-field", @@ -250,7 +257,7 @@ def stage( files_path: str, output_dir: Path, - datatype: str, + datatype: list[str] | None, associated_files: ty.List[AssociatedFiles], project_field: str, subject_field: str, @@ -279,6 +286,11 @@ def stage( logger_configs=loggers, additional_loggers=additional_loggers, ) + datatypes: list[ty.Type[FileSet]] + if datatype is None: + datatypes = [DicomSeries] + else: + datatypes = [FileSet.from_mime(dt) for dt in datatype] # type: ignore[misc] if xnat_login: xnat_repo = Xnat( @@ -292,10 +304,10 @@ def stage( else: project_list = None - if session_field is None and datatype == "medimage/dicom-series": + if session_field is None and DicomSeries in datatypes: session_field = "StudyInstanceUID" - msg = f"Loading {datatype} sessions from '{files_path}'" + msg = f"Loading {list(datatypes)} sessions from '{files_path}'" for assoc_files in associated_files: msg += f" with associated files selected from '{assoc_files.glob}'" @@ -319,6 +331,7 @@ def stage( def do_stage() -> None: sessions = ImagingSession.from_paths( files_path=files_path, + datatypes=datatypes, project_field=project_field, subject_field=subject_field, visit_field=visit_field, diff --git a/xnat_ingest/tests/test_session.py b/xnat_ingest/tests/test_session.py index a5709a7..7853d4d 100644 --- a/xnat_ingest/tests/test_session.py +++ b/xnat_ingest/tests/test_session.py @@ -1,7 +1,7 @@ from pathlib import Path import pytest import typing as ty -from fileformats.core import from_mime, FileSet +from fileformats.core import from_mime from fileformats.medimage import ( DicomSeries, Vnd_Siemens_Biograph128Vision_Vr20b_PetRawData, @@ -10,22 +10,17 @@ ) from frametree.core.frameset import FrameSet # type: ignore[import-untyped] from frametree.common import FileSystem # type: ignore[import-untyped] -from medimages4tests.dummy.dicom.base import default_dicom_dir # type: ignore[import-untyped] from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_pet_image, - __file__ as pet_src_file, ) from medimages4tests.dummy.dicom.ct.ac.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_ac_image, - __file__ as ac_src_file, ) from medimages4tests.dummy.dicom.pet.topogram.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_topogram_image, - __file__ as topogram_src_file, ) from medimages4tests.dummy.dicom.pet.statistics.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_statistics_image, - __file__ as statistics_src_file, ) from xnat_ingest.session import ImagingSession, ImagingScan from xnat_ingest.store import DummyAxes @@ -66,26 +61,18 @@ def imaging_session() -> ImagingSession: DicomSeries(d.iterdir()) for d in ( get_pet_image( - out_dir=default_dicom_dir(pet_src_file).with_suffix(".with-spaces"), first_name=FIRST_NAME, last_name=LAST_NAME, ), get_ac_image( - out_dir=default_dicom_dir(ac_src_file).with_suffix(".with-spaces"), first_name=FIRST_NAME, last_name=LAST_NAME, ), get_topogram_image( - out_dir=default_dicom_dir(topogram_src_file).with_suffix( - ".with-spaces" - ), first_name=FIRST_NAME, last_name=LAST_NAME, ), get_statistics_image( - out_dir=default_dicom_dir(statistics_src_file).with_suffix( - ".with-spaces" - ), first_name=FIRST_NAME, last_name=LAST_NAME, ), From 65d557ca3276248f16b0e3af4f1105b9ca62255f Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 12:19:57 +1000 Subject: [PATCH 2/9] handle case where no loggers are specified --- xnat_ingest/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xnat_ingest/utils.py b/xnat_ingest/utils.py index f62e584..623c067 100644 --- a/xnat_ingest/utils.py +++ b/xnat_ingest/utils.py @@ -126,6 +126,9 @@ def set_logger_handling( ) -> None: """Set up logging for the application""" + if not logger_configs: + return + loggers = [logger] for log in additional_loggers: loggers.append(logging.getLogger(log)) From c1a1bb27f7813d8e1d051eaea476ac594f6c09e4 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 12:20:52 +1000 Subject: [PATCH 3/9] set default logger when no handlers are passed --- xnat_ingest/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xnat_ingest/utils.py b/xnat_ingest/utils.py index 623c067..1b33e6e 100644 --- a/xnat_ingest/utils.py +++ b/xnat_ingest/utils.py @@ -127,7 +127,7 @@ def set_logger_handling( """Set up logging for the application""" if not logger_configs: - return + logger_configs = [LoggerConfig("stream", "info", "stdout")] loggers = [logger] for log in additional_loggers: From 2fc8137cafbd9fe52fbbf5c47c4ec0c1ec43e2ca Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 12:38:58 +1000 Subject: [PATCH 4/9] include file paths in the generated checksums to match those used to upload to XNAT --- xnat_ingest/resource.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xnat_ingest/resource.py b/xnat_ingest/resource.py index 56161d4..401ecab 100644 --- a/xnat_ingest/resource.py +++ b/xnat_ingest/resource.py @@ -27,7 +27,9 @@ class ImagingResource: @checksums.default def calculate_checksums(self) -> dict[str, str]: - return self.fileset.hash_files(crypto=hashlib.md5) + return self.fileset.hash_files( + crypto=hashlib.md5, relative_to=self.fileset.parent + ) @property def datatype(self) -> ty.Type[FileSet]: From 8cae6a87a85908ddf4f26079f4ee5c1f120fc6a7 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 12:42:41 +1000 Subject: [PATCH 5/9] tag missing IDs --- xnat_ingest/session.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index 4479895..204d947 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -338,17 +338,20 @@ def get_id(field_type: str, field_name: str) -> str: try: value = resource.metadata[field_name] except KeyError: - if session_uid and field_type in ("project", "subject", "visit"): - value = ( - "INVALID_MISSING_" - + field_type.upper() - + "_" - + "".join( - random.choices( - string.ascii_letters + string.digits, k=8 - ) - ) + value = "" + if ( + not value + and session_uid + and field_type in ("project", "subject", "visit") + ): + value = ( + "INVALID_MISSING_" + + field_type.upper() + + "_" + + "".join( + random.choices(string.ascii_letters + string.digits, k=8) ) + ) raise ImagingSessionParseError( f"Did not find '{field_name}' field in {resource}, " "cannot uniquely identify the resource" From cdcd91405b41f6a193e6db89bf1db5816147fa17 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 12:46:22 +1000 Subject: [PATCH 6/9] touching up handling of missing IDs --- xnat_ingest/session.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index 204d947..d85b33a 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -339,23 +339,23 @@ def get_id(field_type: str, field_name: str) -> str: value = resource.metadata[field_name] except KeyError: value = "" - if ( - not value - and session_uid - and field_type in ("project", "subject", "visit") - ): - value = ( - "INVALID_MISSING_" - + field_type.upper() - + "_" - + "".join( - random.choices(string.ascii_letters + string.digits, k=8) + if not value: + if session_uid and field_type in ("project", "subject", "visit"): + value = ( + "INVALID_MISSING_" + + field_type.upper() + + "_" + + "".join( + random.choices( + string.ascii_letters + string.digits, k=8 + ) + ) + ) + else: + raise ImagingSessionParseError( + f"Did not find '{field_name}' field in {resource}, " + "cannot uniquely identify the resource" ) - ) - raise ImagingSessionParseError( - f"Did not find '{field_name}' field in {resource}, " - "cannot uniquely identify the resource" - ) if index is not None: value = value[index] value_str = str(value) From 786061817038ad938527ced1629486d3a794cf5f Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 13:13:05 +1000 Subject: [PATCH 7/9] debugging statement --- xnat_ingest/session.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index d85b33a..c66da94 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -399,12 +399,16 @@ def get_id(field_type: str, field_name: str) -> str: ) session.add_resource(scan_id, scan_type, resource_id, resource) if multiple_sessions: - raise ImagingSessionParseError( - "Multiple session UIDs found with the same project/subject/visit ID triplets: " - + "\n".join( - f"{i} -> {p}:{s}:{v}" for i, (p, s, v) in multiple_sessions.items() + try: + raise ImagingSessionParseError( + "Multiple session UIDs found with the same project/subject/visit ID triplets: " + + "\n".join( + f"{i} -> {p}:{s}:{v}" + for i, (p, s, v) in multiple_sessions.items() + ) ) - ) + except ValueError: + raise Exception(str(multiple_sessions)) return list(sessions.values()) def deidentify( From 522819b17f629f596ef1d19f507b99e005adb166 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 13:19:44 +1000 Subject: [PATCH 8/9] debugging handling of missing sessions --- xnat_ingest/session.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index c66da94..4a516e6 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -323,6 +323,7 @@ def from_paths( multiple_sessions: ty.DefaultDict[str, ty.Set[ty.Tuple[str, str, str]]] = ( defaultdict(set) ) + missing_ids: dict[str, dict[str, str]] = defaultdict(dict) for resource in tqdm( resources, "Sorting resources into XNAT tree structure...", @@ -341,16 +342,19 @@ def get_id(field_type: str, field_name: str) -> str: value = "" if not value: if session_uid and field_type in ("project", "subject", "visit"): - value = ( - "INVALID_MISSING_" - + field_type.upper() - + "_" - + "".join( - random.choices( - string.ascii_letters + string.digits, k=8 + try: + value = missing_ids[session_uid][field_type] + except KeyError: + value = missing_ids[session_uid][field_type] = ( + "INVALID_MISSING_" + + field_type.upper() + + "_" + + "".join( + random.choices( + string.ascii_letters + string.digits, k=8 + ) ) ) - ) else: raise ImagingSessionParseError( f"Did not find '{field_name}' field in {resource}, " @@ -399,16 +403,13 @@ def get_id(field_type: str, field_name: str) -> str: ) session.add_resource(scan_id, scan_type, resource_id, resource) if multiple_sessions: - try: - raise ImagingSessionParseError( - "Multiple session UIDs found with the same project/subject/visit ID triplets: " - + "\n".join( - f"{i} -> {p}:{s}:{v}" - for i, (p, s, v) in multiple_sessions.items() - ) + raise ImagingSessionParseError( + "Multiple session UIDs found with the same project/subject/visit ID triplets: " + + "\n".join( + f"{i} -> " + str(["{p}:{s}:{v}" for p, s, v in sess]) + for i, sess in multiple_sessions.items() ) - except ValueError: - raise Exception(str(multiple_sessions)) + ) return list(sessions.values()) def deidentify( From 9bd2c84ad20b10aa0fff542485826a390db5cec5 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 1 Oct 2024 21:27:53 +1000 Subject: [PATCH 9/9] fixed up datatype option for stage --- .pre-commit-config.yaml | 50 ++++++++++++++++++++-------------------- scripts/get_pet_tst.py | 23 ++++++++++++++++++ xnat_ingest/cli/stage.py | 2 +- xnat_ingest/session.py | 5 ++-- 4 files changed, 52 insertions(+), 28 deletions(-) create mode 100644 scripts/get_pet_tst.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7a881d3..3904cd4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,28 +1,28 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-yaml - - id: check-added-large-files -- repo: https://github.com/psf/black - rev: 22.3.0 - hooks: - - id: black - exclude: ^(arcana/_version\.py|versioneer\.py)$ - args: - - -l 88 -- repo: https://github.com/codespell-project/codespell - rev: v2.1.0 - hooks: - - id: codespell - exclude: ^(xnat_checks/_version\.py|versioneer\.py)$ - args: - - --ignore-words=.codespell-ignorewords -- repo: https://github.com/PyCQA/flake8 - rev: 4.0.1 - hooks: - - id: flake8 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + exclude: ^(arcana/_version\.py|versioneer\.py)$ + args: + - -l 88 + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + exclude: ^(xnat_checks/_version\.py|versioneer\.py)$ + args: + - --ignore-words=.codespell-ignorewords + - repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 diff --git a/scripts/get_pet_tst.py b/scripts/get_pet_tst.py new file mode 100644 index 0000000..b905798 --- /dev/null +++ b/scripts/get_pet_tst.py @@ -0,0 +1,23 @@ +import tempfile +from pathlib import Path +from fileformats.medimage import DicomSeries +from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] + get_image as get_pet_image, +) + + +tmp_path = Path(tempfile.mkdtemp()) + +series = DicomSeries( + get_pet_image( + tmp_path, + first_name="first", + last_name="last", + StudyInstanceUID="StudyInstanceUID", + PatientID="PatientID", + AccessionNumber="AccessionNumber", + StudyID="xnat_project", + ).iterdir() +) + +print(series.metadata["StudyID"]) diff --git a/xnat_ingest/cli/stage.py b/xnat_ingest/cli/stage.py index cddcc1f..ce11f53 100644 --- a/xnat_ingest/cli/stage.py +++ b/xnat_ingest/cli/stage.py @@ -287,7 +287,7 @@ def stage( additional_loggers=additional_loggers, ) datatypes: list[ty.Type[FileSet]] - if datatype is None: + if not datatype: datatypes = [DicomSeries] else: datatypes = [FileSet.from_mime(dt) for dt in datatype] # type: ignore[misc] diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index 4a516e6..2953237 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -357,8 +357,9 @@ def get_id(field_type: str, field_name: str) -> str: ) else: raise ImagingSessionParseError( - f"Did not find '{field_name}' field in {resource}, " - "cannot uniquely identify the resource" + f"Did not find '{field_name}' field in {resource!r}, " + "cannot uniquely identify the resource, found:\n" + + "\n".join(resource.metadata) ) if index is not None: value = value[index]