Merge pull request #27 from DIAGNijmegen/m-grt123

MHub / GC - Add grt123 Model for lung cancer prediction based on lung nodules
MHubAI · Feb 28, 2024 · ada037d · ada037d
2 parents dfd6dd9 + 2f6a999
commit ada037d
Show file tree

Hide file tree

Showing 6 changed files with 357 additions and 0 deletions.
diff --git a/models/gc_grt123_lung_cancer/__init__.py b/models/gc_grt123_lung_cancer/__init__.py
@@ -0,0 +1 @@
+from .utils import *
diff --git a/models/gc_grt123_lung_cancer/config/default.yml b/models/gc_grt123_lung_cancer/config/default.yml
@@ -0,0 +1,30 @@
+general:
+  data_base_dir: /app/data
+  version: 1.0
+  description: grt123 lung nodule and lung cancer classifier default (dicom to json)
+
+execute:
+  - DicomImporter
+  - MhaConverter
+  - LungCancerClassifierRunner
+  - DataOrganizer
+
+modules:
+  DicomImporter:
+    source_dir: input_data
+    import_dir: sorted_data
+    sort_data: True
+    meta: 
+      mod: ct
+
+  MhaConverter:
+    engine: panimg
+
+  LungCancerClassifierRunner:
+    n_preprocessing_workers: 8
+
+  DataOrganizer:
+    target_dir: output_data
+    require_data_confirmation: true
+    targets:
+      - json-->[i:sid]/gc_grt123_lung_cancer_findings.json
diff --git a/models/gc_grt123_lung_cancer/dockerfiles/Dockerfile b/models/gc_grt123_lung_cancer/dockerfiles/Dockerfile
@@ -0,0 +1,35 @@
+FROM mhubai/base:latest
+
+# Specify/override authors label
+LABEL authors="[email protected]"
+
+# install required dependencies for grt123 algorithm including GPU support
+RUN pip3 install --no-cache-dir \
+    torch===2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html
+
+# Install grt123 algorithm and model weights
+#   - Git clone the algorithm repository for v2.0.0 (fixed to v2.0.0 tag commit on 2023/09/13)
+#   - We remove unnecessary files for a more compact docker layer
+#   - Subsequently we remove the contents of the .git directory to produce a more compact docker layer, but keep the latest commit hash in the HEAD file
+RUN git clone --branch v2.0.0 https://github.com/DIAGNijmegen/bodyct-dsb2017-grt123.git /gc_grt123_lung_cancer && \
+    cd /gc_grt123_lung_cancer && git reset --hard 9a4ca0415c7fc1d3023a16650bf1cdce86f8bb59 && \
+    rm -rf /gc_grt123_lung_cancer/tests && \
+    rm -rf /gc_grt123_lung_cancer/training && \
+    rm -rf /gc_grt123_lung_cancer/processor && \
+    rm -rf /gc_grt123_lung_cancer/images && \
+    rm /gc_grt123_lung_cancer/README.md && \
+    rm /gc_grt123_lung_cancer/solution-grt123-team.pdf && \
+    mv /gc_grt123_lung_cancer/.git/HEAD /gc_grt123_lung_cancer && \
+    rm -rf /gc_grt123_lung_cancer/.git/* && \
+    mv /gc_grt123_lung_cancer/HEAD /gc_grt123_lung_cancer/.git
+
+# Import the MHub model definition
+ARG MHUB_MODELS_REPO
+RUN buildutils/import_mhub_model.sh gc_grt123_lung_cancer ${MHUB_MODELS_REPO}
+
+# Add grt123 algorithm code base to python path
+ENV PYTHONPATH="/gc_grt123_lung_cancer:/app"
+
+# Default entrypoint
+ENTRYPOINT ["python3", "-m", "mhubio.run"]
+CMD ["--config", "/app/models/gc_grt123_lung_cancer/config/default.yml"]
diff --git a/models/gc_grt123_lung_cancer/meta.json b/models/gc_grt123_lung_cancer/meta.json
@@ -0,0 +1,171 @@
+{
+  "id": "2e67a3cc-4680-4058-bf4e-f965cf50f06f",
+  "name": "gc_grt123_lung_cancer",
+  "title": "Lung cancer risk estimation on thorax CT scans",
+  "summary": {
+    "description": "This algorithm analyzes non-contrast CT scans of the thorax and predicts the lung cancer risk. The model consists of two modules. The first one is a 3D region proposal network for nodule detection, which outputs all suspicious nodules for a subject. The second one selects the top five nodules based on the detection confidence, evaluates their cancer probabilities and combines them with a leaky noisy-or gate to obtain the probability of lung cancer for the subject. This model was the winner of the Data Science Bowl 2017 competition hosted on Kaggle.",
+    "inputs": [
+      {
+        "label": "CT",
+        "description": "Chest CT",
+        "format": "DICOM",
+        "modality": "CT",
+        "bodypartexamined": "Chest",
+        "slicethickness": "2.5mm",
+        "non-contrast": true,
+        "contrast": false
+      }
+    ],
+    "outputs": [
+      {
+        "type": "Prediction",
+        "valueType": "number",
+        "label": "Lung thorax cancer nodule probability score",
+        "description": "The likelihood of the presence of cancer nodules in the lungs.",
+        "classes": []
+      }
+    ],
+    "model": {
+      "architecture": "3D convolutional neural network",
+      "training": "supervised",
+      "cmpapproach": "3D"
+    },
+    "data": {
+      "training": {
+        "vol_samples": 2483
+      },
+      "evaluation": {
+        "vol_samples": 506
+      },
+      "public": true,
+      "external": false
+    }
+  },
+  "details": {
+    "name": "bodyct-dsb2017-grt123",
+    "version": "2.0.0",
+    "devteam": "DIAGNijmegen (Diagnostic Image Analysis Group, Radboud UMC, The Netherlands)",
+    "type": "3D Deep Leaky Noisy-or Network",
+    "date": {
+      "weights": "",
+      "code": "2023-07-04",
+      "pub": "2017-11-22"
+    },
+    "cite": "F. Liao, M. Liang, Z. Li, X. Hu and S. Song, 'Evaluate the Malignancy of Pulmonary Nodules Using the 3D Deep Leaky Noisy-or Network', in IEEE Transactions on Neural Networks and Learning Systems, vol. 30, no. 11, pp. 3484-3495, Nov. 2019, doi: 10.1109/TNNLS.2019.2892409.",
+    "license": {
+      "code": "MIT",
+      "weights": "MIT"
+    },
+    "publications": [
+      {
+        "title": "Evaluate the Malignancy of Pulmonary Nodules Using the 3D Deep Leaky Noisy-or Network",
+        "uri": "https://ieeexplore.ieee.org/abstract/document/8642524"
+      },
+      {
+        "title": "Deep Learning for Lung Cancer Detection on Screening CT Scans: Results of a Large-Scale Public Competition and an Observer Study with 11 Radiologists",
+        "uri": "https://pubmed.ncbi.nlm.nih.gov/34870218/"
+      }
+    ],
+    "github": "https://github.com/DIAGNijmegen/bodyct-dsb2017-grt123",
+    "zenodo": "",
+    "colab": "",
+    "slicer": false
+  },
+  "info": {
+    "use": {
+      "title": "Intended use",
+      "text": "This algorithm analyzes non-contrast CT scans of the thorax: first it segments the lungs, subsequently it detects lung nodules within the lungs, and finally it predicts the lung cancer risk for the individual nodules and the scan as a whole. The algorithm is also hosted on Grand Challenge [1] and was the winner of the Data Science Bowl 2017 challenge on Kaggle [2].",
+      "references": [
+        {
+          "label": "Lung cancer risk estimation algorithm on grand-challenge",
+          "uri": "https://grand-challenge.org/algorithms/dsb2017-grt123/"
+        },
+        {
+          "label": "Data Science Bowl 2017 challenge",
+          "uri": "https://www.kaggle.com/c/data-science-bowl-2017"
+        }
+      ],
+      "tables": []
+    },
+    "analyses": {
+      "title": "Evaluation",
+      "text": "The evaluation of the model was done on the Data Science Bowl 2017 (DSB) dataset hosted on Kaggle [1] (this is no longer publicly available). The nodule detection was evaluated on the validation set of the DSB dataset, which contained data from 198 cases with 71 nodules in total (7 nodules smaller than 6 mm are ruled out). The Free Response Operating Characteristic (FROC) is used to evaluate the performance of the nodule detection. The case cancer classification was evaluated using the Area Under the Curve (AUC) metric on the training set and the testing set of respectively 1397 and 506 patient cases. The AUC and FROC graphs can be viewed in the publication [2]. For the final evaluation on the Data Science Bowl 2017 challenge, the model's performance was evaluated using the logistic loss on a private external dataset of 300 low-dose CT images [3], containing 100 cancer-positive scans and 200 cancer-negative scans. See tables for a summary of the results.",
+      "references": [
+        {
+          "label": "Data Science Bowl 2017 challenge",
+          "uri": "https://www.kaggle.com/c/data-science-bowl-2017"
+        },
+        {
+          "label": "Evaluate the Malignancy of Pulmonary Nodules Using the 3D Deep Leaky Noisy-or Network",
+          "uri": "https://ieeexplore.ieee.org/abstract/document/8642524"
+        },
+        {
+          "label": "Evaluation paper external dataset Data Science Bowl 2017",
+          "uri": "https://pubmed.ncbi.nlm.nih.gov/34870218/"
+        }
+      ],
+      "tables": [
+        {
+          "label": "Case cancer classification results on the DSB 2017 dataset",
+          "entries": {
+            "AUC on training set": "0.90",
+            "AUC on test set": "0.87",
+            "Logistic loss on test set": "0.39975"
+          }
+        },
+        {
+          "label": "Case cancer classification results on private external evaluation dataset.",
+          "entries": {
+            "AUC on all scans": "0.877 (95% CI: 0.842, 0.910)"
+          }
+        }
+      ]
+    },
+    "evaluation": {
+      "title": "Evaluation data",
+      "text": "The model was evaluated on the testing set of 506 patient cases of the Data Science Bowl 2017 (DSB) dataset hosted on Kaggle [1] (this is no longer publicly available).",
+      "references": [
+        {
+          "label": "Data Science Bowl 2017 challenge",
+          "uri": "https://www.kaggle.com/c/data-science-bowl-2017"
+        }
+      ],
+      "tables": []
+    },
+    "training": {
+      "title": "Training data",
+      "text": "Two lung scan datasets were used to train the model: the LUng Nodule Analysis 2016 (LUNA16) dataset [1] [2] and the training set of the Data Science Bowl 2017 (DSB) hosted on Kaggle [3] (this is no longer publicly available). Nodules smaller than 6 mm were removed from the LUNA16 annotations before training. The LUNA16 dataset includes 1186 nodule labels in 888 patient cases annotated by radiologists. The DSB dataset includes 1397 and 198 patient cases in its training and validation sets respectively. The LUNA16 dataset is a subset of the images from the LIDC/IDRI dataset [4] that is available under a Creative Commons Attribution 3.0 Unported License.",
+      "references": [
+        {
+          "label": "LUng Nodule Analysis 2016 dataset part 1",
+          "uri": "https://zenodo.org/record/3723295"
+        },
+        {
+          "label": "LUng Nodule Analysis 2016 dataset part 2",
+          "uri": "https://zenodo.org/record/4121926"
+        },
+        {
+          "label": "Data Science Bowl 2017 challenge",
+          "uri": "https://www.kaggle.com/c/data-science-bowl-2017"
+        },
+        {
+          "label": "The LIDC/IDRI dataset",
+          "uri": "https://www.cancerimagingarchive.net/collection/lidc-idri/"
+        }
+      ],
+      "tables": []
+    },
+    "ethics": {
+      "title": "",
+      "text": "",
+      "references": [],
+      "tables": []
+    },
+    "limitations": {
+      "title": "",
+      "text": "",
+      "references": [],
+      "tables": []
+    }
+  }
+}
diff --git a/models/gc_grt123_lung_cancer/utils/LungCancerClassifierRunner.py b/models/gc_grt123_lung_cancer/utils/LungCancerClassifierRunner.py
@@ -0,0 +1,119 @@
+"""
+--------------------------------------------------------
+Mhub / GC - Run Module for grt123 Lung Cancer Classifier
+--------------------------------------------------------
+
+--------------------------------------------------------
+Author: Sil van de Leemput
+Email:  [email protected]
+--------------------------------------------------------
+"""
+import torch.cuda
+from mhubio.core import Instance, InstanceData, IO, Module, ValueOutput, Meta
+
+from typing import Dict
+import json
+from pathlib import Path
+
+import torch
+
+
+@ValueOutput.Name('lncancerprob')
+@ValueOutput.Meta(Meta(min=0.0, max=1.0, type="probability"))
+@ValueOutput.Label('Lung Nodule cancer probability score.')
+@ValueOutput.Type(float)
+@ValueOutput.Description('The predicted cancer probability score for a single lung nodule detected by the algorithm')
+class LNCancerProb(ValueOutput):
+   pass
+
+
+@ValueOutput.Name('clcancerprob')
+@ValueOutput.Meta(Meta(min=0.0, max=1.0, type="probability"))
+@ValueOutput.Label('Case level cancer probability score.')
+@ValueOutput.Type(float)
+@ValueOutput.Description('The predicted cancer probability score for the whole case')
+class CLCancerProb(ValueOutput):
+    pass
+
+
+# This method cleans the raw results from the grt123 algorithm output and only keeps the relevant details
+def cleanup_json_report(data: Dict):
+    for key in ["trainingset1", "trainingset2"]:
+        del data["lungcad"][key]
+    for key in ["patientuid", "studyuid"]:
+        del data["imageinfo"][key]
+    data["findings"] = [
+        dict(
+            id=f["id"],
+            x=f["x"],
+            y=f["y"],
+            z=f["z"],
+            probability=f["probability"],
+            cancerprobability=f["cancerprobability"]
+        )
+        for f in data["findings"]
+    ]
+
+
+@IO.Config('n_preprocessing_workers', int, 6, the="number of preprocessing workers to use for the grt123 lung mask preprocessor")
+class LungCancerClassifierRunner(Module):
+
+    n_preprocessing_workers: int
+
+    @IO.Instance()
+    @IO.Input('in_data', 'mha:mod=ct', the='input ct scan')
+    @IO.Output('out_data', 'grt123_lung_cancer_findings.json', 'json:model=grt123LungCancerClassification', data='in_data', the='predicted nodules and lung cancer findings of the lung lobe')
+    @IO.OutputData('clcancerprob', CLCancerProb, the='Case level probability score')
+    @IO.OutputDatas('lncancerprobs', LNCancerProb, the='Individual lung nodule probability scores')
+    def task(self, instance: Instance, in_data: InstanceData, out_data: InstanceData, clcancerprob: CLCancerProb, lncancerprobs: LNCancerProb) -> None:
+        # create temporary directories for the preprocessed data and the cropped bounding boxes
+        tmp_path = Path(self.config.data.requestTempDir('grt123'))
+        tmp_output_bbox_dir = tmp_path / "bbox"
+        tmp_output_prep_dir = tmp_path / "prep"
+        tmp_output_bbox_dir.mkdir(exist_ok=True, parents=True)
+        tmp_output_prep_dir.mkdir(exist_ok=True, parents=True)
+
+        # determine the number of GPUs we can use
+        if torch.cuda.is_available():
+            self.log("Running with a GPU", "NOTICE")
+            n_gpu = 1
+        else:
+            self.log("Running on the CPU, might be slow...", "NOTICE")
+            n_gpu = 0
+
+        # Import the main module for the grt123 algorithm, which must be used for running the classification
+        import main
+
+        # apply grt123 algorithm
+        results = main.main(
+            skip_detect=False,
+            skip_preprocessing=False,
+            datapath=str(Path(in_data.abspath).parent),
+            outputdir=str(tmp_path),
+            output_bbox_dir=str(tmp_output_bbox_dir),
+            output_prep_dir=str(tmp_output_prep_dir),
+            n_gpu=n_gpu,
+            n_worker_preprocessing=self.n_preprocessing_workers,
+            data_filter=r".*.mha"
+        )
+
+        # retrieve classification results
+        assert len(results) > 0, "LungCancerClassifierRunner - Always expects at least one output report"
+        results_dict = results[0].to_json()
+        cleanup_json_report(results_dict)
+
+        # export to JSON (original json file)
+        self.log(f"Writing classification results to {out_data.abspath}", "NOTICE")
+        with open(out_data.abspath, "w") as f:
+            json.dump(results_dict, f, indent=4)
+
+        # set output value for case level cancer probability
+        clcancerprob.value = results_dict["cancerinfo"]["casecancerprobability"]
+
+        # set output values for nodule level cancer probabilities
+        for finding in results_dict["findings"]:
+            nodule_cancer_prob = LNCancerProb()
+            nodule_cancer_prob.meta = Meta(id=finding['id'], x=finding['x'], y=finding['y'], z=finding['z'], )
+            nodule_cancer_prob.description += f" (for nodule {finding['id']} at location ({finding['x']}, {finding['y']}, {finding['z']}))"
+            nodule_cancer_prob.value = finding["cancerprobability"]
+            lncancerprobs.add(nodule_cancer_prob)
diff --git a/models/gc_grt123_lung_cancer/utils/__init__.py b/models/gc_grt123_lung_cancer/utils/__init__.py
@@ -0,0 +1 @@
+from .LungCancerClassifierRunner import *