Skip to content

Commit

Permalink
Merge pull request #27 from DIAGNijmegen/m-grt123
Browse files Browse the repository at this point in the history
MHub / GC - Add grt123 Model for lung cancer prediction based on lung nodules
  • Loading branch information
LennyN95 authored Feb 28, 2024
2 parents dfd6dd9 + 2f6a999 commit ada037d
Show file tree
Hide file tree
Showing 6 changed files with 357 additions and 0 deletions.
1 change: 1 addition & 0 deletions models/gc_grt123_lung_cancer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .utils import *
30 changes: 30 additions & 0 deletions models/gc_grt123_lung_cancer/config/default.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
general:
data_base_dir: /app/data
version: 1.0
description: grt123 lung nodule and lung cancer classifier default (dicom to json)

execute:
- DicomImporter
- MhaConverter
- LungCancerClassifierRunner
- DataOrganizer

modules:
DicomImporter:
source_dir: input_data
import_dir: sorted_data
sort_data: True
meta:
mod: ct

MhaConverter:
engine: panimg

LungCancerClassifierRunner:
n_preprocessing_workers: 8

DataOrganizer:
target_dir: output_data
require_data_confirmation: true
targets:
- json-->[i:sid]/gc_grt123_lung_cancer_findings.json
35 changes: 35 additions & 0 deletions models/gc_grt123_lung_cancer/dockerfiles/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
FROM mhubai/base:latest

# Specify/override authors label
LABEL authors="[email protected]"

# install required dependencies for grt123 algorithm including GPU support
RUN pip3 install --no-cache-dir \
torch===2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html

# Install grt123 algorithm and model weights
# - Git clone the algorithm repository for v2.0.0 (fixed to v2.0.0 tag commit on 2023/09/13)
# - We remove unnecessary files for a more compact Docker layer
# - Subsequently we remove the contents of the .git directory to produce a more compact Docker layer, but keep the latest commit hash in the HEAD file
RUN git clone --branch v2.0.0 https://github.com/DIAGNijmegen/bodyct-dsb2017-grt123.git /gc_grt123_lung_cancer && \
cd /gc_grt123_lung_cancer && git reset --hard 9a4ca0415c7fc1d3023a16650bf1cdce86f8bb59 && \
rm -rf /gc_grt123_lung_cancer/tests && \
rm -rf /gc_grt123_lung_cancer/training && \
rm -rf /gc_grt123_lung_cancer/processor && \
rm -rf /gc_grt123_lung_cancer/images && \
rm /gc_grt123_lung_cancer/README.md && \
rm /gc_grt123_lung_cancer/solution-grt123-team.pdf && \
mv /gc_grt123_lung_cancer/.git/HEAD /gc_grt123_lung_cancer && \
rm -rf /gc_grt123_lung_cancer/.git/* && \
mv /gc_grt123_lung_cancer/HEAD /gc_grt123_lung_cancer/.git

# Import the MHub model definition
ARG MHUB_MODELS_REPO
RUN buildutils/import_mhub_model.sh gc_grt123_lung_cancer ${MHUB_MODELS_REPO}

# Add the grt123 algorithm code base to the Python path
ENV PYTHONPATH="/gc_grt123_lung_cancer:/app"

# Default entrypoint
ENTRYPOINT ["python3", "-m", "mhubio.run"]
CMD ["--config", "/app/models/gc_grt123_lung_cancer/config/default.yml"]
171 changes: 171 additions & 0 deletions models/gc_grt123_lung_cancer/meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
{
"id": "2e67a3cc-4680-4058-bf4e-f965cf50f06f",
"name": "gc_grt123_lung_cancer",
"title": "Lung cancer risk estimation on thorax CT scans",
"summary": {
"description": "This algorithm analyzes non-contrast CT scans of the thorax and predicts the lung cancer risk. The model consists of two modules. The first one is a 3D region proposal network for nodule detection, which outputs all suspicious nodules for a subject. The second one selects the top five nodules based on the detection confidence, evaluates their cancer probabilities and combines them with a leaky noisy-or gate to obtain the probability of lung cancer for the subject. This model was the winner of the Data Science Bowl 2017 competition hosted on Kaggle.",
"inputs": [
{
"label": "CT",
"description": "Chest CT",
"format": "DICOM",
"modality": "CT",
"bodypartexamined": "Chest",
"slicethickness": "2.5mm",
"non-contrast": true,
"contrast": false
}
],
"outputs": [
{
"type": "Prediction",
"valueType": "number",
"label": "Lung thorax cancer nodule probability score",
"description": "The likelihood of the presence of cancer nodules in the lungs.",
"classes": []
}
],
"model": {
"architecture": "3D convolutional neural network",
"training": "supervised",
"cmpapproach": "3D"
},
"data": {
"training": {
"vol_samples": 2483
},
"evaluation": {
"vol_samples": 506
},
"public": true,
"external": false
}
},
"details": {
"name": "bodyct-dsb2017-grt123",
"version": "2.0.0",
"devteam": "DIAGNijmegen (Diagnostic Image Analysis Group, Radboud UMC, The Netherlands)",
"type": "3D Deep Leaky Noisy-or Network",
"date": {
"weights": "",
"code": "2023-07-04",
"pub": "2017-11-22"
},
"cite": "F. Liao, M. Liang, Z. Li, X. Hu and S. Song, 'Evaluate the Malignancy of Pulmonary Nodules Using the 3D Deep Leaky Noisy-or Network', in IEEE Transactions on Neural Networks and Learning Systems, vol. 30, no. 11, pp. 3484-3495, Nov. 2019, doi: 10.1109/TNNLS.2019.2892409.",
"license": {
"code": "MIT",
"weights": "MIT"
},
"publications": [
{
"title": "Evaluate the Malignancy of Pulmonary Nodules Using the 3D Deep Leaky Noisy-or Network",
"uri": "https://ieeexplore.ieee.org/abstract/document/8642524"
},
{
"title": "Deep Learning for Lung Cancer Detection on Screening CT Scans: Results of a Large-Scale Public Competition and an Observer Study with 11 Radiologists",
"uri": "https://pubmed.ncbi.nlm.nih.gov/34870218/"
}
],
"github": "https://github.com/DIAGNijmegen/bodyct-dsb2017-grt123",
"zenodo": "",
"colab": "",
"slicer": false
},
"info": {
"use": {
"title": "Intended use",
"text": "This algorithm analyzes non-contrast CT scans of the thorax: first it segments the lungs, then it detects lung nodules within the lungs, and finally it predicts the lung cancer risk for the individual nodules and for the scan as a whole. The algorithm is also hosted on Grand Challenge [1] and was the winner of the Data Science Bowl 2017 challenge on Kaggle [2].",
"references": [
{
"label": "Lung cancer risk estimation algorithm on grand-challenge",
"uri": "https://grand-challenge.org/algorithms/dsb2017-grt123/"
},
{
"label": "Data Science Bowl 2017 challenge",
"uri": "https://www.kaggle.com/c/data-science-bowl-2017"
}
],
"tables": []
},
"analyses": {
"title": "Evaluation",
"text": "The evaluation of the model was done on the Data Science Bowl 2017 (DSB) dataset hosted on Kaggle [1] (this is no longer publicly available). The nodule detection was evaluated on the validation set of the DSB dataset, which contained data from 198 cases with 71 nodules in total (7 nodules smaller than 6 mm were ruled out). The Free Response Operating Characteristic (FROC) is used to evaluate the performance of the nodule detection. The case cancer classification was evaluated using the Area Under the Curve (AUC) metric on the training set and the testing set of 1397 and 506 patient cases, respectively. The AUC and FROC graphs can be viewed in the publication [2]. For the final evaluation on the Data Science Bowl 2017 challenge, the model's performance was evaluated using the logistic loss on a private external dataset of 300 low-dose CT images [3], containing 100 cancer-positive scans and 200 cancer-negative scans. See tables for a summary of the results.",
"references": [
{
"label": "Data Science Bowl 2017 challenge",
"uri": "https://www.kaggle.com/c/data-science-bowl-2017"
},
{
"label": "Evaluate the Malignancy of Pulmonary Nodules Using the 3D Deep Leaky Noisy-or Network",
"uri": "https://ieeexplore.ieee.org/abstract/document/8642524"
},
{
"label": "Evaluation paper external dataset Data Science Bowl 2017",
"uri": "https://pubmed.ncbi.nlm.nih.gov/34870218/"
}
],
"tables": [
{
"label": "Case cancer classification results on the DSB 2017 dataset",
"entries": {
"AUC on training set": "0.90",
"AUC on test set": "0.87",
"Logistic loss on test set": "0.39975"
}
},
{
"label": "Case cancer classification results on private external evaluation dataset.",
"entries": {
"AUC on all scans": "0.877 (95% CI: 0.842, 0.910)"
}
}
]
},
"evaluation": {
"title": "Evaluation data",
"text": "The model was evaluated on the testing set of 506 patient cases of the Data Science Bowl 2017 (DSB) dataset hosted on Kaggle [1] (this is no longer publicly available).",
"references": [
{
"label": "Data Science Bowl 2017 challenge",
"uri": "https://www.kaggle.com/c/data-science-bowl-2017"
}
],
"tables": []
},
"training": {
"title": "Training data",
"text": "Two lung scan datasets were used to train the model: the LUng Nodule Analysis 2016 (LUNA16) dataset [1] [2] and the training set of the Data Science Bowl 2017 (DSB) hosted on Kaggle [3] (this is no longer publicly available). Nodules smaller than 6 mm were removed from the LUNA16 annotations before training. The LUNA16 dataset includes 1186 nodule labels in 888 patient cases annotated by radiologists. The DSB dataset includes 1397 and 198 patient cases in its training and validation sets respectively. The LUNA16 dataset is a subset of the images in the LIDC/IDRI dataset [4] that is available under a Creative Commons Attribution 3.0 Unported License.",
"references": [
{
"label": "LUng Nodule Analysis 2016 dataset part 1",
"uri": "https://zenodo.org/record/3723295"
},
{
"label": "LUng Nodule Analysis 2016 dataset part 2",
"uri": "https://zenodo.org/record/4121926"
},
{
"label": "Data Science Bowl 2017 challenge",
"uri": "https://www.kaggle.com/c/data-science-bowl-2017"
},
{
"label": "The LIDC/IDRI dataset",
"uri": "https://www.cancerimagingarchive.net/collection/lidc-idri/"
}
],
"tables": []
},
"ethics": {
"title": "",
"text": "",
"references": [],
"tables": []
},
"limitations": {
"title": "",
"text": "",
"references": [],
"tables": []
}
}
}
119 changes: 119 additions & 0 deletions models/gc_grt123_lung_cancer/utils/LungCancerClassifierRunner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""
--------------------------------------------------------
Mhub / GC - Run Module for grt123 Lung Cancer Classifier
--------------------------------------------------------
--------------------------------------------------------
Author: Sil van de Leemput
Email: [email protected]
--------------------------------------------------------
"""
import torch.cuda
from mhubio.core import Instance, InstanceData, IO, Module, ValueOutput, Meta

from typing import Dict
import json
from pathlib import Path

import torch


@ValueOutput.Name('lncancerprob')
@ValueOutput.Meta(Meta(min=0.0, max=1.0, type="probability"))
@ValueOutput.Label('Lung Nodule cancer probability score.')
@ValueOutput.Type(float)
@ValueOutput.Description('The predicted cancer probability score for a single lung nodule detected by the algorithm')
class LNCancerProb(ValueOutput):
    """Value output for the cancer probability of a single detected lung nodule.

    The class body is intentionally empty: name ('lncancerprob'), value type
    (float), label, description and the 0.0-1.0 probability metadata are all
    attached by the ``ValueOutput`` decorators above.
    """
    pass


@ValueOutput.Name('clcancerprob')
@ValueOutput.Meta(Meta(min=0.0, max=1.0, type="probability"))
@ValueOutput.Label('Case level cancer probability score.')
@ValueOutput.Type(float)
@ValueOutput.Description('The predicted cancer probability score for the whole case')
class CLCancerProb(ValueOutput):
    """Value output for the case-level (whole scan) cancer probability.

    The class body is intentionally empty: name ('clcancerprob'), value type
    (float), label, description and the 0.0-1.0 probability metadata are all
    attached by the ``ValueOutput`` decorators above.
    """
    pass


def cleanup_json_report(data: Dict):
    """Reduce a raw grt123 report to the relevant details, in place.

    The following entries are removed from ``data``:

    * ``trainingset1`` and ``trainingset2`` from ``data["lungcad"]``
    * ``patientuid`` and ``studyuid`` from ``data["imageinfo"]``

    Afterwards every entry of ``data["findings"]`` is replaced by a new dict
    that only keeps ``id``, ``x``, ``y``, ``z``, ``probability`` and
    ``cancerprobability``.

    Entries that are scheduled for removal but already absent (including a
    missing ``lungcad`` / ``imageinfo`` section) are skipped silently, which
    makes the function idempotent: cleaning an already cleaned report is a
    no-op instead of a ``KeyError``.

    Args:
        data: The report dictionary; it is modified in place.

    Returns:
        None.

    Raises:
        KeyError: If ``data`` has no ``findings`` entry, or a finding lacks
            one of the fields that must be kept.
    """
    # section name -> keys to drop from that section
    keys_to_drop = {
        "lungcad": ("trainingset1", "trainingset2"),
        "imageinfo": ("patientuid", "studyuid"),
    }
    for section_name, keys in keys_to_drop.items():
        section = data.get(section_name)
        if not isinstance(section, dict):
            # nothing to strip from a missing (or non-mapping) section
            continue
        for key in keys:
            # pop with a default tolerates keys that were already removed
            section.pop(key, None)

    # the kept fields stay mandatory: downstream code reads them directly
    kept_fields = ("id", "x", "y", "z", "probability", "cancerprobability")
    data["findings"] = [
        {field: finding[field] for field in kept_fields}
        for finding in data["findings"]
    ]


@IO.Config('n_preprocessing_workers', int, 6, the="number of preprocessing workers to use for the grt123 lung mask preprocessor")
class LungCancerClassifierRunner(Module):
    """MHub module that runs the grt123 lung cancer classifier on a CT scan.

    For each instance the module calls ``main.main`` of the grt123 algorithm
    on the directory that contains the input ``.mha`` file, cleans up the
    first returned report, writes it as JSON and exposes the case-level and
    per-nodule cancer probabilities as value outputs.
    """

    # number of preprocessing workers forwarded to the grt123 algorithm
    # (set through the IO.Config decorator on the class; default 6)
    n_preprocessing_workers: int

    @IO.Instance()
    @IO.Input('in_data', 'mha:mod=ct', the='input ct scan')
    @IO.Output('out_data', 'grt123_lung_cancer_findings.json', 'json:model=grt123LungCancerClassification', data='in_data', the='predicted nodules and lung cancer findings of the lung lobe')
    @IO.OutputData('clcancerprob', CLCancerProb, the='Case level probability score')
    @IO.OutputDatas('lncancerprobs', LNCancerProb, the='Individual lung nodule probability scores')
    def task(self, instance: Instance, in_data: InstanceData, out_data: InstanceData, clcancerprob: CLCancerProb, lncancerprobs: LNCancerProb) -> None:
        """Run the grt123 algorithm on ``in_data`` and publish its findings.

        Args:
            instance: The instance being processed (not used directly here).
            in_data: The input CT scan in MHA format.
            out_data: Destination of the cleaned JSON report.
            clcancerprob: Receives the case-level cancer probability.
            lncancerprobs: Receives one ``LNCancerProb`` per finding.
                NOTE(review): annotated as ``LNCancerProb`` but used as a
                collection via ``.add()`` - confirm the type that
                ``IO.OutputDatas`` injects.

        Raises:
            AssertionError: If the grt123 algorithm returns no report.
        """
        # create temporary directories for the preprocessed data and the cropped bounding boxes
        tmp_path = Path(self.config.data.requestTempDir('grt123'))
        tmp_output_bbox_dir = tmp_path / "bbox"
        tmp_output_prep_dir = tmp_path / "prep"
        tmp_output_bbox_dir.mkdir(exist_ok=True, parents=True)
        tmp_output_prep_dir.mkdir(exist_ok=True, parents=True)

        # determine the number of GPUs we can use
        if torch.cuda.is_available():
            self.log("Running with a GPU", "NOTICE")
            n_gpu = 1
        else:
            self.log("Running on the CPU, might be slow...", "NOTICE")
            n_gpu = 0

        # Import the main module for the grt123 algorithm, which must be used for running the classification.
        # The import is kept local to the task so that importing this file does not require the grt123 code base.
        import main

        # apply grt123 algorithm on the directory that holds the input image
        # NOTE(review): the '.' before 'mha' in data_filter is unescaped; if main.main
        # treats this as a regular expression it matches any character there - confirm
        # and consider r".*\.mha".
        results = main.main(
            skip_detect=False,
            skip_preprocessing=False,
            datapath=str(Path(in_data.abspath).parent),
            outputdir=str(tmp_path),
            output_bbox_dir=str(tmp_output_bbox_dir),
            output_prep_dir=str(tmp_output_prep_dir),
            n_gpu=n_gpu,
            n_worker_preprocessing=self.n_preprocessing_workers,
            data_filter=r".*.mha"
        )

        # retrieve classification results
        # An explicit raise is used instead of an `assert` statement because asserts are
        # stripped under `python -O`; AssertionError is kept so the exception type is unchanged.
        if not results:
            raise AssertionError("LungCancerClassifierRunner - Always expects at least one output report")
        # only the first report is used
        results_dict = results[0].to_json()
        cleanup_json_report(results_dict)

        # export to JSON (original json file)
        self.log(f"Writing classification results to {out_data.abspath}", "NOTICE")
        with open(out_data.abspath, "w") as f:
            json.dump(results_dict, f, indent=4)

        # set output value for case level cancer probability
        clcancerprob.value = results_dict["cancerinfo"]["casecancerprobability"]

        # set output values for nodule level cancer probabilities
        for finding in results_dict["findings"]:
            nodule_cancer_prob = LNCancerProb()
            nodule_cancer_prob.meta = Meta(id=finding['id'], x=finding['x'], y=finding['y'], z=finding['z'])
            nodule_cancer_prob.description += f" (for nodule {finding['id']} at location ({finding['x']}, {finding['y']}, {finding['z']}))"
            nodule_cancer_prob.value = finding["cancerprobability"]
            lncancerprobs.add(nodule_cancer_prob)
1 change: 1 addition & 0 deletions models/gc_grt123_lung_cancer/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .LungCancerClassifierRunner import *

0 comments on commit ada037d

Please sign in to comment.