diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9fe17bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1409bb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Araik Tamazian + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d0b3b93 --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +# kaggle-rsna-ihd +In Kaggle: RSNA Intracranial Hemorrhage Detection competition, the challenge was to build an algorithm to detect acute intracranial hemorrhage and its subtypes. 
+ + +kaggle-rsna-ihd is a Python library to aid in building models for the RSNA Intracranial Hemorrhage Detection task. + +## Installation + +Use the package manager [pip](https://pip.pypa.io/en/stable/) to install kaggle-rsna-ihd. + +```bash +pip install git+https://github.com/atamazian/kaggle_rsna_ihd.git +``` + +## Usage +See example notebook. + +## Contributing +Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. + +Please make sure to update tests as appropriate. + +## License +See LICENSE. \ No newline at end of file diff --git a/demo/__init__.py b/demo/__init__.py new file mode 100644 index 0000000..3c1dba7 --- /dev/null +++ b/demo/__init__.py @@ -0,0 +1,3 @@ +import os + +DEMO_DIR = os.path.dirname(__file__) \ No newline at end of file diff --git a/demo/train.csv b/demo/train.csv new file mode 100644 index 0000000..b707b8d --- /dev/null +++ b/demo/train.csv @@ -0,0 +1,19 @@ +ID,Label +ID_000000000_epidural,0 +ID_000000000_intraparenchymal,0 +ID_000000000_intraventricular,0 +ID_000000000_subarachnoid,0 +ID_000000000_subdural,0 +ID_000000000_any,0 +ID_000000001_epidural,0 +ID_000000001_intraparenchymal,0 +ID_000000001_intraventricular,0 +ID_000000001_subarachnoid,0 +ID_000000001_subdural,0 +ID_000000001_any,0 +ID_000000002_epidural,0 +ID_000000002_intraparenchymal,0 +ID_000000002_intraventricular,0 +ID_000000002_subarachnoid,0 +ID_000000002_subdural,0 +ID_000000002_any,0 diff --git a/demo/train_images/ID_000000000.png b/demo/train_images/ID_000000000.png new file mode 100644 index 0000000..ae3e9b8 Binary files /dev/null and b/demo/train_images/ID_000000000.png differ diff --git a/demo/train_images/ID_000000001.png b/demo/train_images/ID_000000001.png new file mode 100644 index 0000000..ae3e9b8 Binary files /dev/null and b/demo/train_images/ID_000000001.png differ diff --git a/demo/train_images/ID_000000002.png b/demo/train_images/ID_000000002.png new file mode 100644 index 0000000..ae3e9b8 Binary files /dev/null and b/demo/train_images/ID_000000002.png differ diff --git 
class IHDDataset(Dataset):
    """Image/label dataset built from a long-format ``ID,Label`` CSV file.

    Every CSV row has an ``ID`` of the form ``<image stem>_<label type>`` and
    a ``Label`` value. Rows are pivoted so that each image gets one label
    vector ordered as :attr:`LABEL_COLS`. The images are then shuffled with a
    fixed seed and divided into a leading "train" part and a trailing part
    used for every other ``mode``.

    Args:
        path_csv: Path to the CSV file with ``ID`` and ``Label`` columns.
        path_img_dir: Directory holding one ``<image stem>.png`` per image.
        transforms: Optional callable applied to each loaded image.
        mode: ``"train"`` selects the leading part of the shuffled data; any
            other value selects the remaining part.
        split: Fraction of images (rounded up) assigned to the train part.

    Raises:
        ValueError: If ``split`` is outside ``[0, 1]``.
    """

    # Order of the entries in every label vector.
    LABEL_COLS = (
        "epidural",
        "intraparenchymal",
        "intraventricular",
        "subarachnoid",
        "subdural",
        "any",
    )

    def __init__(
        self,
        path_csv: str,
        path_img_dir: str,
        transforms=None,
        mode: str = "train",
        split: float = 0.8,
    ):
        # Raise instead of ``assert``: assertions vanish under ``python -O``.
        if not 0.0 <= split <= 1.0:
            raise ValueError(f"split must be within [0, 1], got {split!r}")

        self.path_img_dir = path_img_dir
        self.transforms = transforms
        self.mode = mode

        raw = pd.read_csv(path_csv)
        # Split each ID once at its LAST underscore so that the image name and
        # the label type always come from the same cut.
        id_parts = raw["ID"].map(lambda value: value.rpartition("_"))
        raw["image_id"] = id_parts.map(lambda parts: parts[0] + ".png")
        raw["type"] = id_parts.map(lambda parts: parts[2])

        # Long format (one row per image/type) -> wide (one row per image).
        table = (
            raw[["Label", "image_id", "type"]]
            .drop_duplicates()
            .pivot(index="image_id", columns="type", values="Label")
            .reset_index()
        )

        # Fixed seed: train and non-train instances see the same permutation,
        # so their slices never overlap.
        table = table.sample(frac=1, random_state=42).reset_index(drop=True)

        cut = int(ceil(split * len(table)))
        self.data = table[:cut] if mode == "train" else table[cut:]
        self.img_names = list(self.data["image_id"])
        self.labels = list(self.data[list(self.LABEL_COLS)].values)

    def __getitem__(self, idx: int) -> tuple:
        """Return ``(image, label)`` for the sample at position ``idx``.

        Raises:
            FileNotFoundError: If the image file for ``idx`` does not exist.
        """
        img_path = os.path.join(self.path_img_dir, self.img_names[idx])
        if not os.path.isfile(img_path):
            raise FileNotFoundError(f"image file not found: {img_path}")

        label = self.labels[idx]
        img = T.LoadImage(image_only=True)(img_path)
        if self.transforms:
            img = self.transforms(img)
        return img, label

    def __len__(self) -> int:
        """Return the number of images in the selected part."""
        return len(self.img_names)
class IHDModel(LightningModule):
    """Lightning module wrapping a classifier trained with BCE-with-logits.

    ``model`` is either a ready ``nn.Module`` or a name handed to
    ``timm.create_model``; in the latter case the network is created with
    ``num_classes=6`` and ``in_chans=1``.

    >>> model = IHDModel("resnet18")
    """

    def __init__(
        self, model: Union[str, nn.Module], pretrained: bool = True, lr: float = 1e-4
    ):
        super().__init__()
        self.learn_rate = lr
        self.loss_fn = F.binary_cross_entropy_with_logits
        # A string is treated as a timm architecture name; anything else is
        # used as the network as-is.
        self.model = (
            timm.create_model(model, pretrained=pretrained, num_classes=6, in_chans=1)
            if isinstance(model, str)
            else model
        )

    def forward(self, x):
        """Run the wrapped network on ``x``."""
        return self.model(x)

    def _batch_loss(self, batch):
        """Compute the loss for one ``(inputs, targets)`` batch."""
        inputs, targets = batch
        logits = self(inputs)
        return self.loss_fn(logits, targets.float())

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` on the progress bar and return it."""
        loss = self._batch_loss(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``valid_loss`` for the batch; nothing is returned."""
        self.log("valid_loss", self._batch_loss(batch), prog_bar=False)

    def configure_optimizers(self):
        """Return AdamW plus a cosine schedule spanning ``trainer.max_epochs``."""
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learn_rate)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=self.trainer.max_epochs, eta_min=0
        )
        return [optimizer], [scheduler]
def find_version(*file_paths):
    """Return the ``__version__`` string declared in the given file.

    ``file_paths`` are path components forwarded to ``read``. The file text
    is searched for a line that starts with ``__version__ = '<value>'``
    (single or double quotes) and ``<value>`` is returned.

    Raises:
        RuntimeError: If no such line is present.
    """
    contents = read(*file_paths)
    # re.M lets ``^`` anchor at the start of any line, not just the file.
    found = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", contents, re.M)
    if found is None:
        raise RuntimeError("Unable to find version string.")
    return found.group(1)
def test_model(tmpdir, path_data=DEMO_DIR):
    """Smoke-test one fast-dev-run training pass of ``IHDModel`` on the demo data.

    Args:
        tmpdir: pytest temporary directory, used as the trainer's root dir.
        path_data: Directory containing ``train.csv`` and ``train_images``.
    """
    dm = IHDDataModule(
        path_csv=os.path.join(path_data, "train.csv"),
        path_img_dir=os.path.join(path_data, "train_images"),
        batch_size=1,
        split=0.6,
    )
    # pretrained=False keeps the test offline: no weight download is needed
    # just to check that the training loop runs.
    model = IHDModel(model="resnet18", pretrained=False)

    trainer = Trainer(
        default_root_dir=str(tmpdir),
        fast_dev_run=True,
        # ``gpus=0`` was removed in Lightning 2.0; ``accelerator="cpu"`` is
        # accepted by every release allowed by setup.py (>=1.5.0).
        accelerator="cpu",
    )
    # ``fit`` runs the datamodule's ``setup`` hook itself, so no explicit
    # ``dm.setup()`` call is needed here.
    trainer.fit(model, datamodule=dm)