Continvvm · ranarag · Jun 24, 2023 · Jun 25, 2023
diff --git a/continuum/datasets/__init__.py b/continuum/datasets/__init__.py
@@ -11,7 +11,7 @@
 from continuum.datasets.cifar100 import CIFAR100
 from continuum.datasets.core50 import Core50, Core50v2_79, Core50v2_196, Core50v2_391
 from continuum.datasets.fellowship import CIFARFellowship, Fellowship, MNISTFellowship
-from continuum.datasets.imagenet import ImageNet100, ImageNet1000, TinyImageNet200
+from continuum.datasets.imagenet import ImageNet100, ImageNet1000, TinyImageNet200, ImageNetR
 from continuum.datasets.synbols import Synbols
 from continuum.datasets.nlp import MultiNLI
 from continuum.datasets.pytorch import (

diff --git a/continuum/datasets/imagenet.py b/continuum/datasets/imagenet.py
@@ -3,9 +3,9 @@
 
 import numpy as np
 from torchvision import transforms
-
+import yaml
 from continuum.datasets import ImageFolderDataset, _ContinuumDataset
-from continuum.download import download, unzip
+from continuum.download import download, unzip, untar
 from continuum.tasks import TaskType
 
 
@@ -125,6 +125,98 @@ def _parse_subset(
         return subset  # type: ignore
 
 
+class ImageNetR(_ContinuumDataset):
+    """ImageNet-R dataset.
+
+    - 200 classes
+    - train/test split files taken from the CODA-Prompt repository
+    - images resized to 224x224 by the default transformations
+    """
+
+    url = "https://people.eecs.berkeley.edu/~hendrycks/imagenet-r.tar"
+    num_classes = 200
+
+    def _download(self):
+        """Download the archive, the split files and the class mapping.
+
+        Anything already present on disk is not downloaded again.
+        """
+        path = os.path.join(self.data_path, "imagenet-r")
+        if not os.path.exists(path):
+            if not os.path.exists(f"{path}.tar"):
+                download(self.url, self.data_path)
+            # Extract even when the archive was already on disk; otherwise a
+            # run interrupted after the download would never be unpacked.
+            untar(f"{path}.tar")
+
+        # The yaml files with the train and test splits come from the
+        # CODA-Prompt repository.
+        extra_files = {
+            'imagenet-r_train.yaml': 'https://raw.githubusercontent.com/GT-RIPL/CODA-Prompt/main/dataloaders/splits/imagenet-r_train.yaml',
+            'imagenet-r_test.yaml': 'https://raw.githubusercontent.com/GT-RIPL/CODA-Prompt/main/dataloaders/splits/imagenet-r_test.yaml',
+            'class_mapping.txt': 'https://gist.githubusercontent.com/ranarag/6620c8fa7da24e1f56f7cdba88d6343a/raw/1d22f7b4902efa22b77c88879579057ac88feb4d/class_mapping.txt',
+        }
+        for fname, file_url in extra_files.items():
+            if not os.path.exists(os.path.join(path, fname)):
+                download(file_url, path)
+
+    @property
+    def data_type(self) -> TaskType:
+        return TaskType.IMAGE_PATH
+
+    def get_data(self) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]:
+        """Return the image paths and targets of the selected split.
+
+        Also fills `self.classes` with, for each target index, the name that
+        `class_mapping.txt` gives to the folder holding that target's images.
+
+        Raises a ValueError if some class has no image in the split.
+        """
+        path = os.path.join(self.data_path, "imagenet-r")
+
+        # One "<folder> <name>" pair per line; blank lines (e.g. the one left
+        # by a trailing newline) are skipped instead of raising IndexError.
+        class_mapping = {}
+        with open(os.path.join(path, 'class_mapping.txt'), 'r') as fid:
+            for line in fid.read().split('\n'):
+                if line.strip():
+                    fields = line.split(' ')
+                    class_mapping[fields[0]] = fields[1]
+
+        split_file = 'imagenet-r_train.yaml' if self.train else 'imagenet-r_test.yaml'
+        # NOTE(review): yaml.Loader can instantiate arbitrary Python objects and
+        # this file is downloaded; consider yaml.safe_load -- TODO confirm the
+        # split files only contain plain lists.
+        with open(os.path.join(path, split_file), 'r') as fid:
+            data_config = yaml.load(fid, Loader=yaml.Loader)
+
+        x = []
+        y = []
+        self.classes = [" "] * self.num_classes
+        for fname, target in zip(data_config['data'], data_config['targets']):
+            target = int(target)
+            # Drop the first two path components recorded in the split file.
+            data_fname = '/'.join(fname.split('/')[2:])
+            if self.classes[target] == " ":
+                self.classes[target] = class_mapping[data_fname.split('/')[0]]
+            x.append(os.path.join(path, data_fname))
+            y.append(target)
+
+        missing = [idx for idx, name in enumerate(self.classes) if name == " "]
+        if missing:
+            raise ValueError(f"Class not found: {missing}")
+        return np.array(x), np.array(y), None
+
+    @property
+    def transformations(self):
+        """Default transformations if nothing is provided to the scenario."""
+        return [
+            transforms.Resize((224, 224)),
+            transforms.ToTensor(),
+            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        ]
+
+
 class TinyImageNet200(_ContinuumDataset):
     """Smaller version of ImageNet.
 

diff --git a/requirements.txt b/requirements.txt
@@ -13,3 +13,4 @@ h5py>=3.1.0
 requests>=2.24.0
 ImageHash>=4.2.1
 datasets>=1.6.0
+PyYAML
diff --git a/tests/test_imagenet.py b/tests/test_imagenet.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from continuum.datasets import ImageNet100
+from continuum.datasets import ImageNet100, ImageNetR
 from continuum.scenarios import ClassIncremental
 
 
@@ -11,29 +11,58 @@
     False: 5000    # test
 }
 
-
+nb_images_per_subset_imagenet_r = {
+    True: 24000,  # train
+    False: 6000    # test
+}
@pytest.fixture
def ImageNet100Test(tmpdir):
    folder = os.path.join(tmpdir, "imagenet100test")
    os.makedirs(folder)
    return ImageNet100(folder, data_subset=None, download=True, train=False)
 
 
+@pytest.fixture
+def ImageNet_RTrain(tmpdir):
+    folder = os.path.join(tmpdir, "imagenet_rtrain")
+    os.makedirs(folder)
+    return ImageNetR(folder, download=True, train=True)
+
+
+@pytest.fixture
+def ImageNet_RTest(tmpdir):
+    folder = os.path.join(tmpdir, "imagenet_rtest")
+    os.makedirs(folder)
+    return ImageNetR(folder, download=True, train=False)
+
+
 @pytest.fixture
 def ImageNet100Train(tmpdir):
     folder = os.path.join(tmpdir, "imagenet100train")
     os.makedirs(folder)
     return ImageNet100(folder, data_subset=None, download=True, train=True)
 
 
+@pytest.mark.parametrize("train", [True, False])
+def test_parsing_imagenet_r(ImageNet_RTrain, ImageNet_RTest, train):
+    dataset = ImageNet_RTrain if train else ImageNet_RTest
+    x, y, t = dataset.get_data()
+    # Parenthesised: a bare conditional binds looser than `in` and is always truthy for train.
+    assert all(("train" if train else "test") in path for path in x)
+
 @pytest.mark.parametrize("train", [True, False])
 def test_parsing_imagenet100(ImageNet100Train, ImageNet100Test, train):
     dataset = ImageNet100Train if train else ImageNet100Test
     x, y, t = dataset.get_data()
 
     assert all("train" if train else "test" in path for path in x)
 
+@pytest.mark.parametrize("train", [True, False])
+def test_nb_imagenet_r(ImageNet_RTrain, ImageNet_RTest, train):
+    dataset = ImageNet_RTrain if train else ImageNet_RTest  # fixture matching the split
+    x, y, t = dataset.get_data()
 
+    assert len(x) == nb_images_per_subset_imagenet_r[train]  # expected size of the split
 
 @pytest.mark.parametrize("train", [True, False])
 def test_nb_imagenet100(ImageNet100Train, ImageNet100Test, train):
@@ -43,6 +72,24 @@ def test_nb_imagenet100(ImageNet100Train, ImageNet100Test, train):
     assert len(x) == nb_images_per_subset[train]
 
 
+# @pytest.mark.parametrize("train,div", [
+#     (True, 1), (True, 2),
+#     (False, 1), (True, 2)
+# ])
+# def test_customsubset_imagenet_r(ImageNet_RTrain, ImageNet_RTest, train, div):
+#     dataset = ImageNet_RTrain if train else ImageNet_RTest
+#     x, y, t = dataset.get_data()
+
+#     new_x = x[:len(x) // div]
+#     new_y = y[:len(y) // div]
+
+#     subset = ImageNet_R(dataset.data_path, data_subset=(new_x, new_y), download=False, train=train)
+#     x2, y2, t2 = subset.get_data()
+
+#     assert len(x) // div == len(x2)
+
+
+
 @pytest.mark.parametrize("train,div", [
     (True, 1), (True, 2),
     (False, 1), (True, 2)