From 8932953c5bbb634a98294a1aeec769936bb6591a Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 25 Nov 2024 18:15:56 +0200
Subject: [PATCH 01/42] Update for the dataloader in case of distributed sampler

---
 src/scvi/dataloaders/_ann_dataloader.py    |  8 ++++
 src/scvi/dataloaders/_concat_dataloader.py | 12 ++++++
 src/scvi/dataloaders/_samplers.py          | 16 +++++++-
 tests/dataloaders/test_dataloaders.py      | 46 ++++++++++++++++++++--
 4 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py
index 27e17302d5..b978239569 100644
--- a/src/scvi/dataloaders/_ann_dataloader.py
+++ b/src/scvi/dataloaders/_ann_dataloader.py
@@ -118,12 +118,17 @@ def __init__(
                     drop_last=drop_last,
                 )
             else:
+                if "save_path" not in kwargs:
+                    kwargs["save_path"] = "/."
+                if "num_processes" not in kwargs:
+                    kwargs["num_processes"] = 1
                 sampler = BatchDistributedSampler(
                     self.dataset,
                     batch_size=batch_size,
                     drop_last=drop_last,
                     drop_dataset_tail=drop_dataset_tail,
                     shuffle=shuffle,
+                    **kwargs
                 )
             # do not touch batch size here, sampler gives batched indices
             # This disables PyTorch automatic batching, which is necessary
@@ -135,4 +140,7 @@ def __init__(
         if iter_ndarray:
             self.kwargs.update({"collate_fn": lambda x: x})
 
+        for redundant_key in ["save_path","num_processes"]:
+            if redundant_key in self.kwargs:
+                self.kwargs.pop(redundant_key)
         super().__init__(self.dataset, **self.kwargs)
diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py
index 9aa9071a85..66802128fe 100644
--- a/src/scvi/dataloaders/_concat_dataloader.py
+++ b/src/scvi/dataloaders/_concat_dataloader.py
@@ -25,6 +25,13 @@ class ConcatDataLoader(DataLoader):
         Dictionary with keys representing keys in data registry (``adata_manager.data_registry``)
         and value equal to desired numpy loading type (later made into torch tensor).
         If ``None``, defaults to all registered data.
+    drop_last
+        If `True` and the dataset is not evenly divisible by `batch_size`, the last
+        incomplete batch is dropped. If `False` and the dataset is not evenly divisible
+        by `batch_size`, then the last batch will be smaller than `batch_size`.
+    distributed_sampler
+        ``EXPERIMENTAL`` Whether to use :class:`~scvi.dataloaders.BatchDistributedSampler` as the
+        sampler. If `True`, `sampler` must be `None`.
     data_loader_kwargs
         Keyword arguments for :class:`~torch.utils.data.DataLoader`
     """
@@ -37,6 +44,7 @@ def __init__(
         batch_size: int = 128,
         data_and_attributes: dict | None = None,
         drop_last: bool | int = False,
+        distributed_sampler: bool = False,
         **data_loader_kwargs,
     ):
         self.adata_manager = adata_manager
@@ -56,11 +64,15 @@ def __init__(
                     batch_size=batch_size,
                     data_and_attributes=data_and_attributes,
                     drop_last=drop_last,
+                    distributed_sampler=distributed_sampler,
                     **self.dataloader_kwargs,
                 )
             )
         lens = [len(dl) for dl in self.dataloaders]
         self.largest_dl = self.dataloaders[np.argmax(lens)]
+        for redundant_key in ["save_path","num_processes"]:
+            if redundant_key in data_loader_kwargs:
+                data_loader_kwargs.pop(redundant_key)
         super().__init__(self.largest_dl, **data_loader_kwargs)
 
     def __len__(self):
diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index 283b2586e2..7137bb7406 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -1,5 +1,5 @@
 from torch.utils.data import Dataset, DistributedSampler
-
+import torch
 
 class BatchDistributedSampler(DistributedSampler):
     """``EXPERIMENTAL`` Sampler that restricts to loading from a subset of the dataset.
@@ -36,6 +36,20 @@ def __init__(
         drop_dataset_tail: bool = False,
         **kwargs,
     ):
+
+        if not torch.distributed.is_initialized():
+            # initializes the distributed backend that takes care of synchronizing processes
+            torch.distributed.init_process_group(
+                "gloo",  # backend that works on all systems
+                init_method="file://"+kwargs["save_path"]+"/dist_file",
+                rank=0,
+                world_size=kwargs["num_processes"],
+            )
+
+        for redundant_key in ["save_path","pin_memory","num_processes","num_workers","persistent_workers"]:
+            if redundant_key in kwargs:
+                kwargs.pop(redundant_key)
+
         super().__init__(dataset, drop_last=drop_dataset_tail, **kwargs)
         self.batch_size = batch_size
         self.drop_last_batch = drop_last  # drop_last already defined in parent
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index d03d9ca9f1..c3fc1573f5 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -2,10 +2,10 @@
 import pytest
 import torch
 from tests.data.utils import generic_setup_adata_manager
-
+import os
 import scvi
 from scvi import REGISTRY_KEYS
-
+from scvi.model import SCANVI
 
 class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan):
     def __init__(self, *args, **kwargs):
@@ -107,14 +107,52 @@ def multiprocessing_worker(
     return
 
 
-@pytest.mark.optional
-def test_anndataloader_distributed_sampler(save_path: str, num_processes: int = 2):
+#@pytest.mark.optional
+@pytest.mark.parametrize("num_processes", [1, 2])
+def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
     adata = scvi.data.synthetic_iid()
     manager = generic_setup_adata_manager(adata)
 
+    file_path = save_path + "/dist_file"
+    if os.path.exists(file_path):  # Check if the file exists
+        os.remove(file_path)
+
     torch.multiprocessing.spawn(
         multiprocessing_worker,
         args=(num_processes, manager, save_path),
         nprocs=num_processes,
         join=True,
     )
+
+#@pytest.mark.optional
+@pytest.mark.parametrize("num_processes", [1, 2])
+def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
+    if torch.cuda.is_available():
+        adata = scvi.data.synthetic_iid()
+        manager = generic_setup_adata_manager(adata)
+        SCANVI.setup_anndata(
+            adata,
+            "labels",
+            "label_0",
+            batch_key="batch",
+        )
+        file_path = save_path + "/dist_file"
+        if os.path.exists(file_path):  # Check if the file exists
+            os.remove(file_path)
+        datasplitter_kwargs = {}
+        datasplitter_kwargs['distributed_sampler'] = True
+        if num_processes==1:
+            datasplitter_kwargs['distributed_sampler'] = False
+        datasplitter_kwargs['save_path'] = save_path
+        datasplitter_kwargs['num_processes'] = num_processes
+        model = SCANVI(adata, n_latent=10)
+
+        torch.multiprocessing.spawn(
+            multiprocessing_worker,
+            args=(num_processes, manager, save_path),
+            nprocs=num_processes,
+            join=True,
+        )
+
+        model.train(1, datasplitter_kwargs=datasplitter_kwargs)
+

From df60e2796f7046c372d6c6864bb0850f0cc005b0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 Nov 2024 16:18:08 +0000
Subject: [PATCH 02/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/scvi/dataloaders/_ann_dataloader.py    |  4 ++--
 src/scvi/dataloaders/_concat_dataloader.py |  2 +-
 src/scvi/dataloaders/_samplers.py          | 14 ++++++++++----
 tests/dataloaders/test_dataloaders.py      | 21 ++++++++++++---------
 4 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py
index b978239569..2b63886ded 100644
--- a/src/scvi/dataloaders/_ann_dataloader.py
+++ b/src/scvi/dataloaders/_ann_dataloader.py
@@ -128,7 +128,7 @@ def __init__(
                     drop_last=drop_last,
                     drop_dataset_tail=drop_dataset_tail,
                     shuffle=shuffle,
-                    **kwargs
+                    **kwargs,
                 )
             # do not touch batch size here, sampler gives batched indices
             # This disables PyTorch automatic batching, which is necessary
@@ -140,7 +140,7 @@ def __init__(
         if iter_ndarray:
             self.kwargs.update({"collate_fn": lambda x: x})
 
-        for redundant_key in ["save_path","num_processes"]:
+        for redundant_key in ["save_path", "num_processes"]:
             if redundant_key in self.kwargs:
                 self.kwargs.pop(redundant_key)
         super().__init__(self.dataset, **self.kwargs)
diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py
index 66802128fe..5b368aee23 100644
--- a/src/scvi/dataloaders/_concat_dataloader.py
+++ b/src/scvi/dataloaders/_concat_dataloader.py
@@ -70,7 +70,7 @@ def __init__(
             )
         lens = [len(dl) for dl in self.dataloaders]
         self.largest_dl = self.dataloaders[np.argmax(lens)]
-        for redundant_key in ["save_path","num_processes"]:
+        for redundant_key in ["save_path", "num_processes"]:
             if redundant_key in data_loader_kwargs:
                 data_loader_kwargs.pop(redundant_key)
         super().__init__(self.largest_dl, **data_loader_kwargs)
diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index 7137bb7406..bd407d60f2 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -1,5 +1,6 @@
-from torch.utils.data import Dataset, DistributedSampler
 import torch
+from torch.utils.data import Dataset, DistributedSampler
+
 
 class BatchDistributedSampler(DistributedSampler):
     """``EXPERIMENTAL`` Sampler that restricts to loading from a subset of the dataset.
@@ -36,17 +37,22 @@ def __init__(
         drop_dataset_tail: bool = False,
         **kwargs,
     ):
-
         if not torch.distributed.is_initialized():
             # initializes the distributed backend that takes care of synchronizing processes
             torch.distributed.init_process_group(
                 "gloo",  # backend that works on all systems
-                init_method="file://"+kwargs["save_path"]+"/dist_file",
+                init_method="file://" + kwargs["save_path"] + "/dist_file",
                 rank=0,
                 world_size=kwargs["num_processes"],
             )
 
-        for redundant_key in ["save_path","pin_memory","num_processes","num_workers","persistent_workers"]:
+        for redundant_key in [
+            "save_path",
+            "pin_memory",
+            "num_processes",
+            "num_workers",
+            "persistent_workers",
+        ]:
             if redundant_key in kwargs:
                 kwargs.pop(redundant_key)
 
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index c3fc1573f5..2b825279f8 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -1,12 +1,15 @@
+import os
+
 import numpy as np
 import pytest
 import torch
 from tests.data.utils import generic_setup_adata_manager
-import os
+
 import scvi
 from scvi import REGISTRY_KEYS
 from scvi.model import SCANVI
 
+
 class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan):
     def __init__(self, *args, **kwargs):
         self.n_samples_per_label = kwargs.pop("n_samples_per_label")
@@ -107,7 +110,7 @@ def multiprocessing_worker(
     return
 
 
-#@pytest.mark.optional
+# @pytest.mark.optional
 @pytest.mark.parametrize("num_processes", [1, 2])
 def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
     adata = scvi.data.synthetic_iid()
@@ -124,7 +127,8 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
         join=True,
     )
 
-#@pytest.mark.optional
+
+# @pytest.mark.optional
 @pytest.mark.parametrize("num_processes", [1, 2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():
@@ -140,11 +144,11 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
         if os.path.exists(file_path):  # Check if the file exists
             os.remove(file_path)
         datasplitter_kwargs = {}
-        datasplitter_kwargs['distributed_sampler'] = True
-        if num_processes==1:
-            datasplitter_kwargs['distributed_sampler'] = False
-        datasplitter_kwargs['save_path'] = save_path
-        datasplitter_kwargs['num_processes'] = num_processes
+        datasplitter_kwargs["distributed_sampler"] = True
+        if num_processes == 1:
+            datasplitter_kwargs["distributed_sampler"] = False
+        datasplitter_kwargs["save_path"] = save_path
+        datasplitter_kwargs["num_processes"] = num_processes
         model = SCANVI(adata, n_latent=10)
 
         torch.multiprocessing.spawn(
@@ -155,4 +159,3 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
         )
 
         model.train(1, datasplitter_kwargs=datasplitter_kwargs)
-

From 353d4449dd784c91981d2e3dda8d8f43125ec32b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 28 Nov 2024 09:08:56 +0000
Subject: [PATCH 03/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 228cad01e2..058dd5d37b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,7 +30,7 @@ to [Semantic Versioning]. Full commit history is available in the
 - Implemented variance of ZINB distribution. {pr}`3044`.
 - Add {class}`scvi.external.METHYLVI` for modeling methylation data from single-cell
     bisulfite sequencing (scBS-seq) experiments {pr}`2834`.
-  
+
 #### Fixed
 
 - Breaking Change: Fix `get_outlier_cell_sample_pairs` function in {class}`scvi.external.MRVI`

From 1e17d7f0ae892455ff74c6769ecb393376ccb65f Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Thu, 28 Nov 2024 12:56:49 +0200
Subject: [PATCH 04/42] fix test_samplers

---
 tests/dataloaders/test_dataloaders.py | 54 +++++++++++++--------------
 tests/dataloaders/test_samplers.py    | 28 ++++++++++++++
 2 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 2b825279f8..3a06ac5af5 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -131,31 +131,31 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 # @pytest.mark.optional
 @pytest.mark.parametrize("num_processes", [1, 2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
-    if torch.cuda.is_available():
-        adata = scvi.data.synthetic_iid()
-        manager = generic_setup_adata_manager(adata)
-        SCANVI.setup_anndata(
-            adata,
-            "labels",
-            "label_0",
-            batch_key="batch",
-        )
-        file_path = save_path + "/dist_file"
-        if os.path.exists(file_path):  # Check if the file exists
-            os.remove(file_path)
-        datasplitter_kwargs = {}
-        datasplitter_kwargs["distributed_sampler"] = True
-        if num_processes == 1:
-            datasplitter_kwargs["distributed_sampler"] = False
-        datasplitter_kwargs["save_path"] = save_path
-        datasplitter_kwargs["num_processes"] = num_processes
-        model = SCANVI(adata, n_latent=10)
-
-        torch.multiprocessing.spawn(
-            multiprocessing_worker,
-            args=(num_processes, manager, save_path),
-            nprocs=num_processes,
-            join=True,
-        )
+    #if torch.cuda.is_available():
+    adata = scvi.data.synthetic_iid()
+    manager = generic_setup_adata_manager(adata)
+    SCANVI.setup_anndata(
+        adata,
+        "labels",
+        "label_0",
+        batch_key="batch",
+    )
+    file_path = save_path + "/dist_file"
+    if os.path.exists(file_path):  # Check if the file exists
+        os.remove(file_path)
+    datasplitter_kwargs = {}
+    datasplitter_kwargs["distributed_sampler"] = True
+    if num_processes == 1:
+        datasplitter_kwargs["distributed_sampler"] = False
+    datasplitter_kwargs["save_path"] = save_path
+    datasplitter_kwargs["num_processes"] = num_processes
+    model = SCANVI(adata, n_latent=10)
+
+    torch.multiprocessing.spawn(
+        multiprocessing_worker,
+        args=(num_processes, manager, save_path),
+        nprocs=num_processes,
+        join=True,
+    )
 
-        model.train(1, datasplitter_kwargs=datasplitter_kwargs)
+    model.train(1, datasplitter_kwargs=datasplitter_kwargs)
diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py
index ae4861d98f..58d04ed67f 100644
--- a/tests/dataloaders/test_samplers.py
+++ b/tests/dataloaders/test_samplers.py
@@ -1,6 +1,7 @@
 from math import ceil, floor
 
 import numpy as np
+import os
 import pytest
 from tests.data.utils import generic_setup_adata_manager
 
@@ -8,7 +9,10 @@
 from scvi.dataloaders import BatchDistributedSampler
 
 
+@pytest.mark.parametrize("num_processes", [1, 2])
 def test_batchdistributedsampler_init(
+    num_processes: int,
+    save_path: str,
     batch_size: int = 128,
     n_batches: int = 2,
 ):
@@ -16,6 +20,10 @@ def test_batchdistributedsampler_init(
     manager = generic_setup_adata_manager(adata)
     dataset = manager.create_torch_dataset()
 
+    file_path = save_path + "/dist_file"
+    if os.path.exists(file_path):  # Check if the file exists
+        os.remove(file_path)
+
     sampler = BatchDistributedSampler(
         dataset,
         num_replicas=1,
@@ -24,6 +32,8 @@ def test_batchdistributedsampler_init(
         shuffle=True,
         drop_last=True,
         drop_dataset_tail=True,
+        num_processes=num_processes,
+        save_path=save_path
     )
     assert sampler.batch_size == batch_size
     assert sampler.rank == 0
@@ -35,9 +45,12 @@ def test_batchdistributedsampler_init(
 
 @pytest.mark.parametrize("drop_last", [True, False])
 @pytest.mark.parametrize("drop_dataset_tail", [True, False])
+@pytest.mark.parametrize("num_processes", [1, 2])
 def test_batchdistributedsampler_drop_last(
+    num_processes: int,
     drop_last: bool,
     drop_dataset_tail: bool,
+    save_path: str,
     batch_size: int = 128,
     n_batches: int = 3,
     num_replicas: int = 2,
@@ -101,6 +114,10 @@ def check_samplers(samplers: list, sampler_batch_size: int):
             assert len(all_indices) == effective_n_obs_per_sampler
             assert [len(indices) for indices in batch_indices] == batch_sizes
 
+    file_path = save_path + "/dist_file"
+    if os.path.exists(file_path):  # Check if the file exists
+        os.remove(file_path)
+
     for sampler_batch_size in [batch_size, batch_size - 1, batch_size + 1]:
         samplers = [
             BatchDistributedSampler(
@@ -110,13 +127,18 @@ def check_samplers(samplers: list, sampler_batch_size: int):
                 batch_size=sampler_batch_size,
                 drop_last=drop_last,
                 drop_dataset_tail=drop_dataset_tail,
+                num_processes=num_processes,
+                save_path=save_path
             )
             for i in range(num_replicas)
         ]
         check_samplers(samplers, sampler_batch_size)
 
 
+@pytest.mark.parametrize("num_processes", [1, 2])
 def test_batchdistributedsampler_indices(
+    num_processes: int,
+    save_path: str,
     batch_size: int = 128,
     n_batches: int = 3,
     num_replicas: int = 2,
@@ -125,12 +147,18 @@ def test_batchdistributedsampler_indices(
     manager = generic_setup_adata_manager(adata)
     dataset = manager.create_torch_dataset()
 
+    file_path = save_path + "/dist_file"
+    if os.path.exists(file_path):  # Check if the file exists
+        os.remove(file_path)
+
     samplers = [
         BatchDistributedSampler(
             dataset,
             num_replicas=num_replicas,
             rank=i,
             batch_size=batch_size,
+            num_processes=num_processes,
+            save_path=save_path
         )
         for i in range(num_replicas)
     ]

From d63337ab0852d0429fbd03d8b030cfd607063810 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 28 Nov 2024 10:57:05 +0000
Subject: [PATCH 05/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/dataloaders/test_dataloaders.py | 2 +-
 tests/dataloaders/test_samplers.py    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 3a06ac5af5..82e07b1b26 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -131,7 +131,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 # @pytest.mark.optional
 @pytest.mark.parametrize("num_processes", [1, 2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
-    #if torch.cuda.is_available():
+    # if torch.cuda.is_available():
     adata = scvi.data.synthetic_iid()
     manager = generic_setup_adata_manager(adata)
     SCANVI.setup_anndata(
diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py
index 58d04ed67f..09a8a5a127 100644
--- a/tests/dataloaders/test_samplers.py
+++ b/tests/dataloaders/test_samplers.py
@@ -1,7 +1,7 @@
+import os
 from math import ceil, floor
 
 import numpy as np
-import os
 import pytest
 from tests.data.utils import generic_setup_adata_manager
 
@@ -33,7 +33,7 @@ def test_batchdistributedsampler_init(
         drop_last=True,
         drop_dataset_tail=True,
         num_processes=num_processes,
-        save_path=save_path
+        save_path=save_path,
     )
     assert sampler.batch_size == batch_size
     assert sampler.rank == 0
@@ -128,7 +128,7 @@ def check_samplers(samplers: list, sampler_batch_size: int):
                 drop_last=drop_last,
                 drop_dataset_tail=drop_dataset_tail,
                 num_processes=num_processes,
-                save_path=save_path
+                save_path=save_path,
             )
             for i in range(num_replicas)
         ]
@@ -158,7 +158,7 @@ def test_batchdistributedsampler_indices(
             rank=i,
             batch_size=batch_size,
             num_processes=num_processes,
-            save_path=save_path
+            save_path=save_path,
         )
         for i in range(num_replicas)
     ]

From ca0daf0345b10f6630eefd33abb261e1b9d97f7e Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Thu, 28 Nov 2024 13:11:40 +0200
Subject: [PATCH 06/42] fix test_samplers

---
 tests/dataloaders/test_dataloaders.py | 54 +++++++++++++--------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 3a06ac5af5..2b825279f8 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -131,31 +131,31 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 # @pytest.mark.optional
 @pytest.mark.parametrize("num_processes", [1, 2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
-    #if torch.cuda.is_available():
-    adata = scvi.data.synthetic_iid()
-    manager = generic_setup_adata_manager(adata)
-    SCANVI.setup_anndata(
-        adata,
-        "labels",
-        "label_0",
-        batch_key="batch",
-    )
-    file_path = save_path + "/dist_file"
-    if os.path.exists(file_path):  # Check if the file exists
-        os.remove(file_path)
-    datasplitter_kwargs = {}
-    datasplitter_kwargs["distributed_sampler"] = True
-    if num_processes == 1:
-        datasplitter_kwargs["distributed_sampler"] = False
-    datasplitter_kwargs["save_path"] = save_path
-    datasplitter_kwargs["num_processes"] = num_processes
-    model = SCANVI(adata, n_latent=10)
-
-    torch.multiprocessing.spawn(
-        multiprocessing_worker,
-        args=(num_processes, manager, save_path),
-        nprocs=num_processes,
-        join=True,
-    )
+    if torch.cuda.is_available():
+        adata = scvi.data.synthetic_iid()
+        manager = generic_setup_adata_manager(adata)
+        SCANVI.setup_anndata(
+            adata,
+            "labels",
+            "label_0",
+            batch_key="batch",
+        )
+        file_path = save_path + "/dist_file"
+        if os.path.exists(file_path):  # Check if the file exists
+            os.remove(file_path)
+        datasplitter_kwargs = {}
+        datasplitter_kwargs["distributed_sampler"] = True
+        if num_processes == 1:
+            datasplitter_kwargs["distributed_sampler"] = False
+        datasplitter_kwargs["save_path"] = save_path
+        datasplitter_kwargs["num_processes"] = num_processes
+        model = SCANVI(adata, n_latent=10)
+
+        torch.multiprocessing.spawn(
+            multiprocessing_worker,
+            args=(num_processes, manager, save_path),
+            nprocs=num_processes,
+            join=True,
+        )
 
-    model.train(1, datasplitter_kwargs=datasplitter_kwargs)
+        model.train(1, datasplitter_kwargs=datasplitter_kwargs)

From fe99eabd70a160f2013a28c8aa81ddb8978c54aa Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Thu, 28 Nov 2024 16:03:35 +0200
Subject: [PATCH 07/42] Change to NVIDIA NCCL and fix the drop-last-batch
 parameter so batches are even between GPUs

---
 src/scvi/dataloaders/_ann_dataloader.py    | 10 +++++-----
 src/scvi/dataloaders/_concat_dataloader.py |  6 ++++--
 src/scvi/dataloaders/_samplers.py          |  3 ++-
 src/scvi/nn/_base_components.py            |  3 ++-
 tests/dataloaders/test_dataloaders.py      | 21 +++++++++++++--------
 tests/dataloaders/test_samplers.py         |  8 ++++----
 6 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py
index 2b63886ded..ee3336ce09 100644
--- a/src/scvi/dataloaders/_ann_dataloader.py
+++ b/src/scvi/dataloaders/_ann_dataloader.py
@@ -117,6 +117,10 @@ def __init__(
                     batch_size=batch_size,
                     drop_last=drop_last,
                 )
+                # do not touch batch size here, sampler gives batched indices
+                # This disables PyTorch automatic batching, which is necessary
+                # for fast access to sparse matrices
+                self.kwargs.update({"batch_size": None, "shuffle": False})
             else:
                 if "save_path" not in kwargs:
                     kwargs["save_path"] = "/."
@@ -130,10 +134,6 @@ def __init__(
                     shuffle=shuffle,
                     **kwargs,
                 )
-            # do not touch batch size here, sampler gives batched indices
-            # This disables PyTorch automatic batching, which is necessary
-            # for fast access to sparse matrices
-            self.kwargs.update({"batch_size": None, "shuffle": False})
 
         self.kwargs.update({"sampler": sampler})
 
@@ -143,4 +143,4 @@ def __init__(
         for redundant_key in ["save_path", "num_processes"]:
             if redundant_key in self.kwargs:
                 self.kwargs.pop(redundant_key)
-        super().__init__(self.dataset, **self.kwargs)
+        super().__init__(self.dataset, drop_last=drop_dataset_tail, **self.kwargs)
diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py
index 5b368aee23..a1c4616f56 100644
--- a/src/scvi/dataloaders/_concat_dataloader.py
+++ b/src/scvi/dataloaders/_concat_dataloader.py
@@ -53,6 +53,8 @@ def __init__(
         self._shuffle = shuffle
         self._batch_size = batch_size
         self._drop_last = drop_last
+        self._drop_dataset_tail = self.dataloader_kwargs["drop_dataset_tail"] \
+            if "drop_dataset_tail" in self.dataloader_kwargs.keys() else False
 
         self.dataloaders = []
         for indices in indices_list:
@@ -70,10 +72,10 @@ def __init__(
             )
         lens = [len(dl) for dl in self.dataloaders]
         self.largest_dl = self.dataloaders[np.argmax(lens)]
-        for redundant_key in ["save_path", "num_processes"]:
+        for redundant_key in ["save_path", "num_processes","drop_dataset_tail"]:
             if redundant_key in data_loader_kwargs:
                 data_loader_kwargs.pop(redundant_key)
-        super().__init__(self.largest_dl, **data_loader_kwargs)
+        super().__init__(self.largest_dl, drop_last=self._drop_dataset_tail, **data_loader_kwargs)
 
     def __len__(self):
         return len(self.largest_dl)
diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index bd407d60f2..3e62bc9b3c 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -40,10 +40,11 @@ def __init__(
         if not torch.distributed.is_initialized():
             # initializes the distributed backend that takes care of synchronizing processes
             torch.distributed.init_process_group(
-                "gloo",  # backend that works on all systems
+                "nccl",  # backend that works on all systems
                 init_method="file://" + kwargs["save_path"] + "/dist_file",
                 rank=0,
                 world_size=kwargs["num_processes"],
+                store=None
             )
 
         for redundant_key in [
diff --git a/src/scvi/nn/_base_components.py b/src/scvi/nn/_base_components.py
index 59b537a321..f632aff18d 100644
--- a/src/scvi/nn/_base_components.py
+++ b/src/scvi/nn/_base_components.py
@@ -175,7 +175,8 @@ def forward(self, x: torch.Tensor, *cat_list: int):
                         if isinstance(layer, nn.Linear) and self.inject_into_layer(i):
                             if x.dim() == 3:
                                 one_hot_cat_list_layer = [
-                                    o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1)))
+                                    o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1))) if o.dim()==2
+                                    else o[0].unsqueeze(0).expand((x.size(1), o.size(1), o.size(2)))
                                     for o in one_hot_cat_list
                                 ]
                             else:
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 2b825279f8..b3b22abda2 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -95,17 +95,18 @@ def test_anndataloader_distributed_sampler_init():
 
 
 def multiprocessing_worker(
-    rank: int, world_size: int, manager: scvi.data.AnnDataManager, save_path: str
+    rank: int, world_size: int, manager: scvi.data.AnnDataManager, save_path: str, datasplitter_kwargs
 ):
     # initializes the distributed backend that takes care of synchronizing processes
     torch.distributed.init_process_group(
-        "gloo",  # backend that works on all systems
+        "nccl",  # backend that works on all systems
         init_method=f"file://{save_path}/dist_file",
         rank=rank,
         world_size=world_size,
+        store = None
     )
 
-    _ = scvi.dataloaders.AnnDataLoader(manager, distributed_sampler=True)
+    _ = scvi.dataloaders.AnnDataLoader(manager, **datasplitter_kwargs)
 
     return
 
@@ -122,14 +123,14 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 
     torch.multiprocessing.spawn(
         multiprocessing_worker,
-        args=(num_processes, manager, save_path),
+        args=(num_processes, manager, save_path, {}),
         nprocs=num_processes,
         join=True,
     )
 
 
 # @pytest.mark.optional
-@pytest.mark.parametrize("num_processes", [1, 2])
+@pytest.mark.parametrize("num_processes", [2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():
         adata = scvi.data.synthetic_iid()
@@ -144,16 +145,20 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
         if os.path.exists(file_path):  # Check if the file exists
             os.remove(file_path)
         datasplitter_kwargs = {}
+        #Multi-GPU settings
         datasplitter_kwargs["distributed_sampler"] = True
-        if num_processes == 1:
-            datasplitter_kwargs["distributed_sampler"] = False
         datasplitter_kwargs["save_path"] = save_path
         datasplitter_kwargs["num_processes"] = num_processes
+        datasplitter_kwargs["drop_dataset_tail"] = True
+        datasplitter_kwargs["drop_last"] = False
+        if num_processes == 1:
+            datasplitter_kwargs["distributed_sampler"] = False
+            datasplitter_kwargs["drop_dataset_tail"] = False
         model = SCANVI(adata, n_latent=10)
 
         torch.multiprocessing.spawn(
             multiprocessing_worker,
-            args=(num_processes, manager, save_path),
+            args=(num_processes, manager, save_path, {}),
             nprocs=num_processes,
             join=True,
         )
diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py
index 09a8a5a127..58d04ed67f 100644
--- a/tests/dataloaders/test_samplers.py
+++ b/tests/dataloaders/test_samplers.py
@@ -1,7 +1,7 @@
-import os
 from math import ceil, floor
 
 import numpy as np
+import os
 import pytest
 from tests.data.utils import generic_setup_adata_manager
 
@@ -33,7 +33,7 @@ def test_batchdistributedsampler_init(
         drop_last=True,
         drop_dataset_tail=True,
         num_processes=num_processes,
-        save_path=save_path,
+        save_path=save_path
     )
     assert sampler.batch_size == batch_size
     assert sampler.rank == 0
@@ -128,7 +128,7 @@ def check_samplers(samplers: list, sampler_batch_size: int):
                 drop_last=drop_last,
                 drop_dataset_tail=drop_dataset_tail,
                 num_processes=num_processes,
-                save_path=save_path,
+                save_path=save_path
             )
             for i in range(num_replicas)
         ]
@@ -158,7 +158,7 @@ def test_batchdistributedsampler_indices(
             rank=i,
             batch_size=batch_size,
             num_processes=num_processes,
-            save_path=save_path,
+            save_path=save_path
         )
         for i in range(num_replicas)
     ]

From 1e2386bf46f3ba54f425fb2930748f3e91381a8c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 28 Nov 2024 14:04:59 +0000
Subject: [PATCH 08/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/scvi/dataloaders/_concat_dataloader.py |  9 ++++++---
 src/scvi/dataloaders/_samplers.py          |  2 +-
 src/scvi/nn/_base_components.py            |  7 +++++--
 tests/dataloaders/test_dataloaders.py      | 10 +++++++---
 tests/dataloaders/test_samplers.py         |  8 ++++----
 5 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py
index a1c4616f56..9d35e2fb47 100644
--- a/src/scvi/dataloaders/_concat_dataloader.py
+++ b/src/scvi/dataloaders/_concat_dataloader.py
@@ -53,8 +53,11 @@ def __init__(
         self._shuffle = shuffle
         self._batch_size = batch_size
         self._drop_last = drop_last
-        self._drop_dataset_tail = self.dataloader_kwargs["drop_dataset_tail"] \
-            if "drop_dataset_tail" in self.dataloader_kwargs.keys() else False
+        self._drop_dataset_tail = (
+            self.dataloader_kwargs["drop_dataset_tail"]
+            if "drop_dataset_tail" in self.dataloader_kwargs.keys()
+            else False
+        )
 
         self.dataloaders = []
         for indices in indices_list:
@@ -72,7 +75,7 @@ def __init__(
             )
         lens = [len(dl) for dl in self.dataloaders]
         self.largest_dl = self.dataloaders[np.argmax(lens)]
-        for redundant_key in ["save_path", "num_processes","drop_dataset_tail"]:
+        for redundant_key in ["save_path", "num_processes", "drop_dataset_tail"]:
             if redundant_key in data_loader_kwargs:
                 data_loader_kwargs.pop(redundant_key)
         super().__init__(self.largest_dl, drop_last=self._drop_dataset_tail, **data_loader_kwargs)
diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index 3e62bc9b3c..0d7b246940 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -44,7 +44,7 @@ def __init__(
                 init_method="file://" + kwargs["save_path"] + "/dist_file",
                 rank=0,
                 world_size=kwargs["num_processes"],
-                store=None
+                store=None,
             )
 
         for redundant_key in [
diff --git a/src/scvi/nn/_base_components.py b/src/scvi/nn/_base_components.py
index f632aff18d..75cf438da5 100644
--- a/src/scvi/nn/_base_components.py
+++ b/src/scvi/nn/_base_components.py
@@ -175,8 +175,11 @@ def forward(self, x: torch.Tensor, *cat_list: int):
                         if isinstance(layer, nn.Linear) and self.inject_into_layer(i):
                             if x.dim() == 3:
                                 one_hot_cat_list_layer = [
-                                    o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1))) if o.dim()==2
-                                    else o[0].unsqueeze(0).expand((x.size(1), o.size(1), o.size(2)))
+                                    o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1)))
+                                    if o.dim() == 2
+                                    else o[0]
+                                    .unsqueeze(0)
+                                    .expand((x.size(1), o.size(1), o.size(2)))
                                     for o in one_hot_cat_list
                                 ]
                             else:
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index b3b22abda2..95925e0992 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -95,7 +95,11 @@ def test_anndataloader_distributed_sampler_init():
 
 
 def multiprocessing_worker(
-    rank: int, world_size: int, manager: scvi.data.AnnDataManager, save_path: str, datasplitter_kwargs
+    rank: int,
+    world_size: int,
+    manager: scvi.data.AnnDataManager,
+    save_path: str,
+    datasplitter_kwargs,
 ):
     # initializes the distributed backend that takes care of synchronizing processes
     torch.distributed.init_process_group(
@@ -103,7 +107,7 @@ def multiprocessing_worker(
         init_method=f"file://{save_path}/dist_file",
         rank=rank,
         world_size=world_size,
-        store = None
+        store=None,
     )
 
     _ = scvi.dataloaders.AnnDataLoader(manager, **datasplitter_kwargs)
@@ -145,7 +149,7 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
         if os.path.exists(file_path):  # Check if the file exists
             os.remove(file_path)
         datasplitter_kwargs = {}
-        #Multi-GPU settings
+        # Multi-GPU settings
         datasplitter_kwargs["distributed_sampler"] = True
         datasplitter_kwargs["save_path"] = save_path
         datasplitter_kwargs["num_processes"] = num_processes
diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py
index 58d04ed67f..09a8a5a127 100644
--- a/tests/dataloaders/test_samplers.py
+++ b/tests/dataloaders/test_samplers.py
@@ -1,7 +1,7 @@
+import os
 from math import ceil, floor
 
 import numpy as np
-import os
 import pytest
 from tests.data.utils import generic_setup_adata_manager
 
@@ -33,7 +33,7 @@ def test_batchdistributedsampler_init(
         drop_last=True,
         drop_dataset_tail=True,
         num_processes=num_processes,
-        save_path=save_path
+        save_path=save_path,
     )
     assert sampler.batch_size == batch_size
     assert sampler.rank == 0
@@ -128,7 +128,7 @@ def check_samplers(samplers: list, sampler_batch_size: int):
                 drop_last=drop_last,
                 drop_dataset_tail=drop_dataset_tail,
                 num_processes=num_processes,
-                save_path=save_path
+                save_path=save_path,
             )
             for i in range(num_replicas)
         ]
@@ -158,7 +158,7 @@ def test_batchdistributedsampler_indices(
             rank=i,
             batch_size=batch_size,
             num_processes=num_processes,
-            save_path=save_path
+            save_path=save_path,
         )
         for i in range(num_replicas)
     ]

From 2e3519a8ddd3d50f7d04244451a1eed69aba9414 Mon Sep 17 00:00:00 2001
From: ori-kron-wis <ori.kronfeld@weizmann.ac.il>
Date: Thu, 28 Nov 2024 18:27:06 +0200
Subject: [PATCH 09/42] revert base_component

---
 src/scvi/nn/_base_components.py       | 4 ----
 tests/dataloaders/test_dataloaders.py | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/scvi/nn/_base_components.py b/src/scvi/nn/_base_components.py
index 75cf438da5..59b537a321 100644
--- a/src/scvi/nn/_base_components.py
+++ b/src/scvi/nn/_base_components.py
@@ -176,10 +176,6 @@ def forward(self, x: torch.Tensor, *cat_list: int):
                             if x.dim() == 3:
                                 one_hot_cat_list_layer = [
                                     o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1)))
-                                    if o.dim() == 2
-                                    else o[0]
-                                    .unsqueeze(0)
-                                    .expand((x.size(1), o.size(1), o.size(2)))
                                     for o in one_hot_cat_list
                                 ]
                             else:
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 95925e0992..880fdc3567 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -134,7 +134,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 
 
 # @pytest.mark.optional
-@pytest.mark.parametrize("num_processes", [2])
+@pytest.mark.parametrize("num_processes", [1,2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():
         adata = scvi.data.synthetic_iid()

From 22c074837f4ba5f0b295c0e4095dd1e09523c57c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 28 Nov 2024 16:27:21 +0000
Subject: [PATCH 10/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/dataloaders/test_dataloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 880fdc3567..02813ef0de 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -134,7 +134,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 
 
 # @pytest.mark.optional
-@pytest.mark.parametrize("num_processes", [1,2])
+@pytest.mark.parametrize("num_processes", [1, 2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():
         adata = scvi.data.synthetic_iid()

From dabd350e1e973bef77ec5d66f0d2125ffd552462 Mon Sep 17 00:00:00 2001
From: ori-kron-wis <ori.kronfeld@weizmann.ac.il>
Date: Thu, 28 Nov 2024 18:35:55 +0200
Subject: [PATCH 11/42] Make CPU tests pass by gating distributed code on CUDA

---
 src/scvi/dataloaders/_samplers.py     |  2 +-
 tests/dataloaders/test_dataloaders.py | 25 ++++++++++++-------------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index 0d7b246940..0392384684 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -37,7 +37,7 @@ def __init__(
         drop_dataset_tail: bool = False,
         **kwargs,
     ):
-        if not torch.distributed.is_initialized():
+        if not torch.distributed.is_initialized() and torch.cuda.is_available():
             # initializes the distributed backend that takes care of synchronizing processes
             torch.distributed.init_process_group(
                 "nccl",  # backend that works on all systems
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 02813ef0de..2000ec1f25 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -115,25 +115,24 @@ def multiprocessing_worker(
     return
 
 
-# @pytest.mark.optional
 @pytest.mark.parametrize("num_processes", [1, 2])
 def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
-    adata = scvi.data.synthetic_iid()
-    manager = generic_setup_adata_manager(adata)
+    if torch.cuda.is_available():
+        adata = scvi.data.synthetic_iid()
+        manager = generic_setup_adata_manager(adata)
 
-    file_path = save_path + "/dist_file"
-    if os.path.exists(file_path):  # Check if the file exists
-        os.remove(file_path)
+        file_path = save_path + "/dist_file"
+        if os.path.exists(file_path):  # Check if the file exists
+            os.remove(file_path)
 
-    torch.multiprocessing.spawn(
-        multiprocessing_worker,
-        args=(num_processes, manager, save_path, {}),
-        nprocs=num_processes,
-        join=True,
-    )
+        torch.multiprocessing.spawn(
+            multiprocessing_worker,
+            args=(num_processes, manager, save_path, {}),
+            nprocs=num_processes,
+            join=True,
+        )
 
 
-# @pytest.mark.optional
 @pytest.mark.parametrize("num_processes", [1, 2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():

From ce7f3318e210675707d86d4761a3f4305fb3a98f Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 2 Dec 2024 15:52:29 +0200
Subject: [PATCH 12/42] Added multi-GPU training tests for scvi and scanvi +
 revert some code fixes

---
 src/scvi/dataloaders/_ann_dataloader.py    |  7 ---
 src/scvi/dataloaders/_concat_dataloader.py |  3 -
 src/scvi/dataloaders/_samplers.py          | 11 ----
 tests/dataloaders/test_dataloaders.py      | 42 ++------------
 tests/dataloaders/test_samplers.py         | 28 ---------
 tests/model/test_scanvi.py                 | 66 ++++++++++++++++++++++
 tests/model/test_scvi.py                   | 59 +++++++++++++++++++
 7 files changed, 130 insertions(+), 86 deletions(-)

diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py
index ee3336ce09..82a13ac0b6 100644
--- a/src/scvi/dataloaders/_ann_dataloader.py
+++ b/src/scvi/dataloaders/_ann_dataloader.py
@@ -122,10 +122,6 @@ def __init__(
                 # for fast access to sparse matrices
                 self.kwargs.update({"batch_size": None, "shuffle": False})
             else:
-                if "save_path" not in kwargs:
-                    kwargs["save_path"] = "/."
-                if "num_processes" not in kwargs:
-                    kwargs["num_processes"] = 1
                 sampler = BatchDistributedSampler(
                     self.dataset,
                     batch_size=batch_size,
@@ -140,7 +136,4 @@ def __init__(
         if iter_ndarray:
             self.kwargs.update({"collate_fn": lambda x: x})
 
-        for redundant_key in ["save_path", "num_processes"]:
-            if redundant_key in self.kwargs:
-                self.kwargs.pop(redundant_key)
         super().__init__(self.dataset, drop_last=drop_dataset_tail, **self.kwargs)
diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py
index 9d35e2fb47..554f0267f0 100644
--- a/src/scvi/dataloaders/_concat_dataloader.py
+++ b/src/scvi/dataloaders/_concat_dataloader.py
@@ -75,9 +75,6 @@ def __init__(
             )
         lens = [len(dl) for dl in self.dataloaders]
         self.largest_dl = self.dataloaders[np.argmax(lens)]
-        for redundant_key in ["save_path", "num_processes", "drop_dataset_tail"]:
-            if redundant_key in data_loader_kwargs:
-                data_loader_kwargs.pop(redundant_key)
         super().__init__(self.largest_dl, drop_last=self._drop_dataset_tail, **data_loader_kwargs)
 
     def __len__(self):
diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index 0392384684..1e866721b7 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -37,20 +37,9 @@ def __init__(
         drop_dataset_tail: bool = False,
         **kwargs,
     ):
-        if not torch.distributed.is_initialized() and torch.cuda.is_available():
-            # initializes the distributed backend that takes care of synchronizing processes
-            torch.distributed.init_process_group(
-                "nccl",  # backend that works on all systems
-                init_method="file://" + kwargs["save_path"] + "/dist_file",
-                rank=0,
-                world_size=kwargs["num_processes"],
-                store=None,
-            )
 
         for redundant_key in [
-            "save_path",
             "pin_memory",
-            "num_processes",
             "num_workers",
             "persistent_workers",
         ]:
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 2000ec1f25..9ef80f71de 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -7,8 +7,12 @@
 
 import scvi
 from scvi import REGISTRY_KEYS
-from scvi.model import SCANVI
+from scvi.model import SCANVI, SCVI
 
+import sys
+
+if __name__ == "__main__" and "pytest" in sys.modules:
+    sys.argv = sys.argv[:1]  # Remove pytest arguments
 
 class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan):
     def __init__(self, *args, **kwargs):
@@ -131,39 +135,3 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
             nprocs=num_processes,
             join=True,
         )
-
-
-@pytest.mark.parametrize("num_processes", [1, 2])
-def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
-    if torch.cuda.is_available():
-        adata = scvi.data.synthetic_iid()
-        manager = generic_setup_adata_manager(adata)
-        SCANVI.setup_anndata(
-            adata,
-            "labels",
-            "label_0",
-            batch_key="batch",
-        )
-        file_path = save_path + "/dist_file"
-        if os.path.exists(file_path):  # Check if the file exists
-            os.remove(file_path)
-        datasplitter_kwargs = {}
-        # Multi-GPU settings
-        datasplitter_kwargs["distributed_sampler"] = True
-        datasplitter_kwargs["save_path"] = save_path
-        datasplitter_kwargs["num_processes"] = num_processes
-        datasplitter_kwargs["drop_dataset_tail"] = True
-        datasplitter_kwargs["drop_last"] = False
-        if num_processes == 1:
-            datasplitter_kwargs["distributed_sampler"] = False
-            datasplitter_kwargs["drop_dataset_tail"] = False
-        model = SCANVI(adata, n_latent=10)
-
-        torch.multiprocessing.spawn(
-            multiprocessing_worker,
-            args=(num_processes, manager, save_path, {}),
-            nprocs=num_processes,
-            join=True,
-        )
-
-        model.train(1, datasplitter_kwargs=datasplitter_kwargs)
diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py
index 09a8a5a127..ae4861d98f 100644
--- a/tests/dataloaders/test_samplers.py
+++ b/tests/dataloaders/test_samplers.py
@@ -1,4 +1,3 @@
-import os
 from math import ceil, floor
 
 import numpy as np
@@ -9,10 +8,7 @@
 from scvi.dataloaders import BatchDistributedSampler
 
 
-@pytest.mark.parametrize("num_processes", [1, 2])
 def test_batchdistributedsampler_init(
-    num_processes: int,
-    save_path: str,
     batch_size: int = 128,
     n_batches: int = 2,
 ):
@@ -20,10 +16,6 @@ def test_batchdistributedsampler_init(
     manager = generic_setup_adata_manager(adata)
     dataset = manager.create_torch_dataset()
 
-    file_path = save_path + "/dist_file"
-    if os.path.exists(file_path):  # Check if the file exists
-        os.remove(file_path)
-
     sampler = BatchDistributedSampler(
         dataset,
         num_replicas=1,
@@ -32,8 +24,6 @@ def test_batchdistributedsampler_init(
         shuffle=True,
         drop_last=True,
         drop_dataset_tail=True,
-        num_processes=num_processes,
-        save_path=save_path,
     )
     assert sampler.batch_size == batch_size
     assert sampler.rank == 0
@@ -45,12 +35,9 @@ def test_batchdistributedsampler_init(
 
 @pytest.mark.parametrize("drop_last", [True, False])
 @pytest.mark.parametrize("drop_dataset_tail", [True, False])
-@pytest.mark.parametrize("num_processes", [1, 2])
 def test_batchdistributedsampler_drop_last(
-    num_processes: int,
     drop_last: bool,
     drop_dataset_tail: bool,
-    save_path: str,
     batch_size: int = 128,
     n_batches: int = 3,
     num_replicas: int = 2,
@@ -114,10 +101,6 @@ def check_samplers(samplers: list, sampler_batch_size: int):
             assert len(all_indices) == effective_n_obs_per_sampler
             assert [len(indices) for indices in batch_indices] == batch_sizes
 
-    file_path = save_path + "/dist_file"
-    if os.path.exists(file_path):  # Check if the file exists
-        os.remove(file_path)
-
     for sampler_batch_size in [batch_size, batch_size - 1, batch_size + 1]:
         samplers = [
             BatchDistributedSampler(
@@ -127,18 +110,13 @@ def check_samplers(samplers: list, sampler_batch_size: int):
                 batch_size=sampler_batch_size,
                 drop_last=drop_last,
                 drop_dataset_tail=drop_dataset_tail,
-                num_processes=num_processes,
-                save_path=save_path,
             )
             for i in range(num_replicas)
         ]
         check_samplers(samplers, sampler_batch_size)
 
 
-@pytest.mark.parametrize("num_processes", [1, 2])
 def test_batchdistributedsampler_indices(
-    num_processes: int,
-    save_path: str,
     batch_size: int = 128,
     n_batches: int = 3,
     num_replicas: int = 2,
@@ -147,18 +125,12 @@ def test_batchdistributedsampler_indices(
     manager = generic_setup_adata_manager(adata)
     dataset = manager.create_torch_dataset()
 
-    file_path = save_path + "/dist_file"
-    if os.path.exists(file_path):  # Check if the file exists
-        os.remove(file_path)
-
     samplers = [
         BatchDistributedSampler(
             dataset,
             num_replicas=num_replicas,
             rank=i,
             batch_size=batch_size,
-            num_processes=num_processes,
-            save_path=save_path,
         )
         for i in range(num_replicas)
     ]
diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py
index aede994029..746a64ee31 100644
--- a/tests/model/test_scanvi.py
+++ b/tests/model/test_scanvi.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import pytest
 import torch
+import subprocess
 
 from scvi.data import synthetic_iid
 from scvi.model import SCANVI, SCVI
@@ -578,3 +579,68 @@ def check_no_logits_and_softmax(model: SCANVI):
 
     model = SCANVI.load(resave_model_path, adata)
     check_no_logits_and_softmax(model)
+
+
+def test_scanvi_train_ddp():
+    training_code = """
+import torch
+import scvi
+from scvi.model import SCANVI
+
+adata = scvi.data.synthetic_iid()
+SCANVI.setup_anndata(
+    adata,
+    "labels",
+    "label_0",
+    batch_key="batch",
+)
+
+model = SCANVI(adata, n_latent=10)
+
+model.train(
+    max_epochs=100,
+    train_size=0.5,
+    check_val_every_n_epoch=1,
+    accelerator="gpu",
+    devices=-1,
+    strategy="ddp_find_unused_parameters_true",
+)
+
+torch.distributed.destroy_process_group()
+
+assert model.is_trained
+"""
+
+    if torch.cuda.is_available():
+
+        # Get the current working directory (CWD)
+        cwd = os.getcwd()
+
+        # Define the file path for the temporary script in the current working directory
+        temp_file_path = os.path.join(cwd, "train_scanvi_ddp_temp.py")
+
+        # Write the training code to the file in the current working directory
+        with open(temp_file_path, "w") as temp_file:
+            temp_file.write(training_code)
+            print(f"Temporary Python file created at: {temp_file_path}")
+
+        def launch_ddp(world_size, temp_file_path):
+            # Command to run the script via torchrun
+            command = [
+                "torchrun",
+                "--nproc_per_node="+str(world_size),  # Specify the number of GPUs
+                temp_file_path  # Your original script
+            ]
+            # Use subprocess to run the command
+            try:
+                # Run the command, wait for it to finish & clean up the temporary file
+                subprocess.run(command, check=True)
+            except subprocess.CalledProcessError as e:
+                os.remove(temp_file_path)
+                raise ValueError(
+                    f"Error occurred while running the DDP training: {e}"
+                )
+            finally:
+                os.remove(temp_file_path)
+
+        launch_ddp(torch.cuda.device_count(), temp_file_path)
diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index 49fe18e531..44c62f2247 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -11,6 +11,7 @@
 from lightning.pytorch.callbacks import LearningRateMonitor
 from scipy.sparse import csr_matrix
 from torch.nn import Softplus
+import subprocess
 
 import scvi
 from scvi.data import _constants, synthetic_iid
@@ -1297,3 +1298,61 @@ def test_scvi_num_workers():
     model.get_reconstruction_error()
     model.get_normalized_expression(transform_batch="batch_1")
     model.get_normalized_expression(n_samples=2)
+
+
+def test_scvi_train_ddp():
+    training_code = """
+import torch
+import scvi
+from scvi.model import SCVI
+
+adata = scvi.data.synthetic_iid()
+SCVI.setup_anndata(adata)
+
+model = SCVI(adata)
+
+model.train(
+    max_epochs=100,
+    check_val_every_n_epoch=1,
+    accelerator="gpu",
+    devices=-1,
+    strategy="ddp_find_unused_parameters_true",
+)
+
+torch.distributed.destroy_process_group()
+
+assert model.is_trained
+"""
+
+    if torch.cuda.is_available():
+        # Get the current working directory (CWD)
+        cwd = os.getcwd()
+
+        # Define the file path for the temporary script in the current working directory
+        temp_file_path = os.path.join(cwd, "train_scvi_ddp_temp.py")
+
+        # Write the training code to the file in the current working directory
+        with open(temp_file_path, "w") as temp_file:
+            temp_file.write(training_code)
+            print(f"Temporary Python file created at: {temp_file_path}")
+
+        def launch_ddp(world_size, temp_file_path):
+            # Command to run the script via torchrun
+            command = [
+                "torchrun",
+                "--nproc_per_node="+str(world_size),  # Specify the number of GPUs
+                temp_file_path  # Your original script
+            ]
+            # Use subprocess to run the command
+            try:
+                # Run the command, wait for it to finish & clean up the temporary file
+                subprocess.run(command, check=True)
+            except subprocess.CalledProcessError as e:
+                os.remove(temp_file_path)
+                raise ValueError(
+                    f"Error occurred while running the DDP training: {e}"
+                )
+            finally:
+                os.remove(temp_file_path)
+
+        launch_ddp(torch.cuda.device_count(), temp_file_path)

From 67ec59b581c5c8a9583ca9db58b6aa5bc0136115 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 2 Dec 2024 13:52:56 +0000
Subject: [PATCH 13/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/scvi/dataloaders/_samplers.py     |  2 --
 tests/dataloaders/test_dataloaders.py |  5 ++---
 tests/model/test_scanvi.py            | 11 ++++-------
 tests/model/test_scvi.py              | 10 ++++------
 4 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index 1e866721b7..8753c6feec 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -1,4 +1,3 @@
-import torch
 from torch.utils.data import Dataset, DistributedSampler
 
 
@@ -37,7 +36,6 @@ def __init__(
         drop_dataset_tail: bool = False,
         **kwargs,
     ):
-
         for redundant_key in [
             "pin_memory",
             "num_workers",
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 9ef80f71de..33795890a8 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -1,4 +1,5 @@
 import os
+import sys
 
 import numpy as np
 import pytest
@@ -7,13 +8,11 @@
 
 import scvi
 from scvi import REGISTRY_KEYS
-from scvi.model import SCANVI, SCVI
-
-import sys
 
 if __name__ == "__main__" and "pytest" in sys.modules:
     sys.argv = sys.argv[:1]  # Remove pytest arguments
 
+
 class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan):
     def __init__(self, *args, **kwargs):
         self.n_samples_per_label = kwargs.pop("n_samples_per_label")
diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py
index 746a64ee31..5685806d60 100644
--- a/tests/model/test_scanvi.py
+++ b/tests/model/test_scanvi.py
@@ -1,11 +1,11 @@
 import os
 import pickle
+import subprocess
 
 import numpy as np
 import pandas as pd
 import pytest
 import torch
-import subprocess
 
 from scvi.data import synthetic_iid
 from scvi.model import SCANVI, SCVI
@@ -612,7 +612,6 @@ def test_scanvi_train_ddp():
 """
 
     if torch.cuda.is_available():
-
         # Get the current working directory (CWD)
         cwd = os.getcwd()
 
@@ -628,8 +627,8 @@ def launch_ddp(world_size, temp_file_path):
             # Command to run the script via torchrun
             command = [
                 "torchrun",
-                "--nproc_per_node="+str(world_size),  # Specify the number of GPUs
-                temp_file_path  # Your original script
+                "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+                temp_file_path,  # Your original script
             ]
             # Use subprocess to run the command
             try:
@@ -637,9 +636,7 @@ def launch_ddp(world_size, temp_file_path):
                 subprocess.run(command, check=True)
             except subprocess.CalledProcessError as e:
                 os.remove(temp_file_path)
-                raise ValueError(
-                    f"Error occurred while running the DDP training: {e}"
-                )
+                raise ValueError(f"Error occurred while running the DDP training: {e}")
             finally:
                 os.remove(temp_file_path)
 
diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index 44c62f2247..c181aa4220 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1,6 +1,7 @@
 import inspect
 import os
 import pickle
+import subprocess
 import tarfile
 from unittest import mock
 
@@ -11,7 +12,6 @@
 from lightning.pytorch.callbacks import LearningRateMonitor
 from scipy.sparse import csr_matrix
 from torch.nn import Softplus
-import subprocess
 
 import scvi
 from scvi.data import _constants, synthetic_iid
@@ -1340,8 +1340,8 @@ def launch_ddp(world_size, temp_file_path):
             # Command to run the script via torchrun
             command = [
                 "torchrun",
-                "--nproc_per_node="+str(world_size),  # Specify the number of GPUs
-                temp_file_path  # Your original script
+                "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+                temp_file_path,  # Your original script
             ]
             # Use subprocess to run the command
             try:
@@ -1349,9 +1349,7 @@ def launch_ddp(world_size, temp_file_path):
                 subprocess.run(command, check=True)
             except subprocess.CalledProcessError as e:
                 os.remove(temp_file_path)
-                raise ValueError(
-                    f"Error occurred while running the DDP training: {e}"
-                )
+                raise ValueError(f"Error occurred while running the DDP training: {e}")
             finally:
                 os.remove(temp_file_path)
 

From 054674e13bca0d8b7d616ea862e6fdd1cb74cd5e Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 2 Dec 2024 18:08:53 +0200
Subject: [PATCH 14/42] tests fixes

---
 tests/model/test_models_with_minified_data.py | 6 +++---
 tests/model/test_scanvi.py                    | 3 ++-
 tests/model/test_scvi.py                      | 3 ++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/model/test_models_with_minified_data.py b/tests/model/test_models_with_minified_data.py
index 52c9013362..e88efa14e6 100644
--- a/tests/model/test_models_with_minified_data.py
+++ b/tests/model/test_models_with_minified_data.py
@@ -281,10 +281,10 @@ def test_validate_supported_if_minified_keep_count():
     assert model.minified_data_type == ADATA_MINIFY_TYPE.LATENT_POSTERIOR_WITH_COUNTS
     assert model2.minified_data_type is None
 
-    assert np.allclose(model2.get_elbo(), model.get_elbo(), rtol=5e-2)
+    assert np.allclose(model2.get_elbo().cpu(), model.get_elbo().cpu(), rtol=5e-2)
     assert np.allclose(
-        model2.get_reconstruction_error()["reconstruction_loss"],
-        model.get_reconstruction_error()["reconstruction_loss"],
+        model2.get_reconstruction_error()["reconstruction_loss"].cpu(),
+        model.get_reconstruction_error()["reconstruction_loss"].cpu(),
         rtol=5e-2,
     )
     assert np.allclose(model2.get_marginal_ll(), model.get_marginal_ll(), rtol=5e-2)
diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py
index 5685806d60..b7fd0864e8 100644
--- a/tests/model/test_scanvi.py
+++ b/tests/model/test_scanvi.py
@@ -636,7 +636,8 @@ def launch_ddp(world_size, temp_file_path):
                 subprocess.run(command, check=True)
             except subprocess.CalledProcessError as e:
                 os.remove(temp_file_path)
-                raise ValueError(f"Error occurred while running the DDP training: {e}")
+                print(f"Error occurred while running the DDP training: {e}")
+                raise
             finally:
                 os.remove(temp_file_path)
 
diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index c181aa4220..f499fe7f75 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1349,7 +1349,8 @@ def launch_ddp(world_size, temp_file_path):
                 subprocess.run(command, check=True)
             except subprocess.CalledProcessError as e:
                 os.remove(temp_file_path)
-                raise ValueError(f"Error occurred while running the DDP training: {e}")
+                print(f"Error occurred while running the DDP training: {e}")
+                raise
             finally:
                 os.remove(temp_file_path)
 

From 00f5c37d1da6f4f728abd6ebff84792a2afc3167 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 2 Dec 2024 21:34:53 +0200
Subject: [PATCH 15/42] revert scanvi distributed test

---
 src/scvi/model/_scanvi.py              |   3 +-
 tests/dataloaders/test_datasplitter.py |  18 +++-
 tests/model/test_scanvi.py             | 122 ++++++++++++-------------
 3 files changed, 79 insertions(+), 64 deletions(-)

diff --git a/src/scvi/model/_scanvi.py b/src/scvi/model/_scanvi.py
index 084d83be1f..836a22d492 100644
--- a/src/scvi/model/_scanvi.py
+++ b/src/scvi/model/_scanvi.py
@@ -25,7 +25,7 @@
     NumericalObsField,
 )
 from scvi.dataloaders import SemiSupervisedDataSplitter
-from scvi.model._utils import _init_library_size, get_max_epochs_heuristic
+from scvi.model._utils import _init_library_size, get_max_epochs_heuristic, use_distributed_sampler
 from scvi.module import SCANVAE
 from scvi.train import SemiSupervisedTrainingPlan, TrainRunner
 from scvi.train._callbacks import SubSampleLabels
@@ -411,6 +411,7 @@ def train(
             shuffle_set_split=shuffle_set_split,
             n_samples_per_label=n_samples_per_label,
             batch_size=batch_size,
+            #distributed_sampler=use_distributed_sampler(trainer_kwargs.get("strategy", None)),
             **datasplitter_kwargs,
         )
         training_plan = self._training_plan_cls(self.module, self.n_labels, **plan_kwargs)
diff --git a/tests/dataloaders/test_datasplitter.py b/tests/dataloaders/test_datasplitter.py
index c38e4ba57e..b8b9e8d72e 100644
--- a/tests/dataloaders/test_datasplitter.py
+++ b/tests/dataloaders/test_datasplitter.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pytest
 from sparse_utils import TestSparseModel
-from tests.data.utils import generic_setup_adata_manager
+from tests.data.utils import generic_setup_adata_manager, scanvi_setup_adata_manager
 
 import scvi
 
@@ -17,7 +17,7 @@ def test_datasplitter_shuffle(self):
 
         with pytest.raises(ValueError) as excinfo:
             scvi.dataloaders.DataSplitter(
-                manager, train_size=1.5, validation_size=0.3, shuffle_set_split=False
+                manager, train_size=1.5, validation_size=0.3, shuffle_set_split=False,
             )
         assert str(excinfo.value) == "Invalid train_size. Must be: 0 < train_size <= 1"
 
@@ -188,6 +188,20 @@ def test_datasplitter_external_with_duplicates(self):
             scvi.dataloaders.DataSplitter(manager, external_indexing=[train_ind])
         assert str(excinfo.value) == "There are duplicate indexing in train set"
 
+    def test_datasplitter_distributed_sampler(self):
+        adata = scvi.data.synthetic_iid()
+        manager = generic_setup_adata_manager(adata)
+        datasplitter_kwargs = {}
+        datasplitter_kwargs['distributed_sampler'] = True
+
+        scvi.dataloaders.DataSplitter(manager, **datasplitter_kwargs,)
+
+    def test_semisupervised_datasplitter_distributed_sampler(self):
+        adata = scvi.data.synthetic_iid()
+        manager = scanvi_setup_adata_manager(adata, labels_key="labels",unlabeled_category="label_0")
+        datasplitter_kwargs = {}
+        datasplitter_kwargs['distributed_sampler'] = True
+        scvi.dataloaders.SemiSupervisedDataSplitter(adata_manager=manager, **datasplitter_kwargs,)
 
 @pytest.mark.parametrize("sparse_format", ["csr_matrix", "csc_matrix"])
 def test_datasplitter_load_sparse_tensor(
diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py
index b7fd0864e8..199d895abc 100644
--- a/tests/model/test_scanvi.py
+++ b/tests/model/test_scanvi.py
@@ -581,64 +581,64 @@ def check_no_logits_and_softmax(model: SCANVI):
     check_no_logits_and_softmax(model)
 
 
-def test_scanvi_train_ddp():
-    training_code = """
-import torch
-import scvi
-from scvi.model import SCANVI
-
-adata = scvi.data.synthetic_iid()
-SCANVI.setup_anndata(
-    adata,
-    "labels",
-    "label_0",
-    batch_key="batch",
-)
-
-model = SCANVI(adata, n_latent=10)
-
-model.train(
-    max_epochs=100,
-    train_size=0.5,
-    check_val_every_n_epoch=1,
-    accelerator="gpu",
-    devices=-1,
-    strategy="ddp_find_unused_parameters_true",
-)
-
-torch.distributed.destroy_process_group()
-
-assert model.is_trained
-"""
-
-    if torch.cuda.is_available():
-        # Get the current working directory (CWD)
-        cwd = os.getcwd()
-
-        # Define the file path for the temporary script in the current working directory
-        temp_file_path = os.path.join(cwd, "train_scanvi_ddp_temp.py")
-
-        # Write the training code to the file in the current working directory
-        with open(temp_file_path, "w") as temp_file:
-            temp_file.write(training_code)
-            print(f"Temporary Python file created at: {temp_file_path}")
-
-        def launch_ddp(world_size, temp_file_path):
-            # Command to run the script via torchrun
-            command = [
-                "torchrun",
-                "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-                temp_file_path,  # Your original script
-            ]
-            # Use subprocess to run the command
-            try:
-                # Run the command, wait for it to finish & clean up the temporary file
-                subprocess.run(command, check=True)
-            except subprocess.CalledProcessError as e:
-                os.remove(temp_file_path)
-                print(f"Error occurred while running the DDP training: {e}")
-                raise
-            finally:
-                os.remove(temp_file_path)
-
-        launch_ddp(torch.cuda.device_count(), temp_file_path)
+# def test_scanvi_train_ddp():
+#     training_code = """
+# import torch
+# import scvi
+# from scvi.model import SCANVI
+#
+# adata = scvi.data.synthetic_iid()
+# SCANVI.setup_anndata(
+#     adata,
+#     "labels",
+#     "label_0",
+#     batch_key="batch",
+# )
+#
+# model = SCANVI(adata, n_latent=10)
+#
+# model.train(
+#     max_epochs=100,
+#     train_size=0.5,
+#     check_val_every_n_epoch=1,
+#     accelerator="gpu",
+#     devices=-1,
+#     strategy="ddp_find_unused_parameters_true",
+# )
+#
+# torch.distributed.destroy_process_group()
+#
+# assert model.is_trained
+# """
+#
+#     if torch.cuda.is_available():
+#         # Get the current working directory (CWD)
+#         cwd = os.getcwd()
+#
+#         # Define the file path for the temporary script in the current working directory
+#         temp_file_path = os.path.join(cwd, "train_scanvi_ddp_temp.py")
+#
+#         # Write the training code to the file in the current working directory
+#         with open(temp_file_path, "w") as temp_file:
+#             temp_file.write(training_code)
+#             print(f"Temporary Python file created at: {temp_file_path}")
+#
+#         def launch_ddp(world_size, temp_file_path):
+#             # Command to run the script via torchrun
+#             command = [
+#                 "torchrun",
+#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+#                 temp_file_path,  # Your original script
+#             ]
+#             # Use subprocess to run the command
+#             try:
+#                 # Run the command, wait for it to finish & clean up the temporary file
+#                 subprocess.run(command, check=True)
+#             except subprocess.CalledProcessError as e:
+#                 os.remove(temp_file_path)
+#                 print(f"Error occurred while running the DDP training: {e}")
+#                 raise
+#             finally:
+#                 os.remove(temp_file_path)
+#
+#         launch_ddp(torch.cuda.device_count(), temp_file_path)

From 19b382f40e44b42ee092d068bd6c358ec8a8cb24 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 2 Dec 2024 19:35:14 +0000
Subject: [PATCH 16/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/scvi/model/_scanvi.py              |  4 ++--
 tests/dataloaders/test_datasplitter.py | 24 ++++++++++++++++++------
 tests/model/test_scanvi.py             |  1 -
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/scvi/model/_scanvi.py b/src/scvi/model/_scanvi.py
index 836a22d492..edd75e5324 100644
--- a/src/scvi/model/_scanvi.py
+++ b/src/scvi/model/_scanvi.py
@@ -25,7 +25,7 @@
     NumericalObsField,
 )
 from scvi.dataloaders import SemiSupervisedDataSplitter
-from scvi.model._utils import _init_library_size, get_max_epochs_heuristic, use_distributed_sampler
+from scvi.model._utils import _init_library_size, get_max_epochs_heuristic
 from scvi.module import SCANVAE
 from scvi.train import SemiSupervisedTrainingPlan, TrainRunner
 from scvi.train._callbacks import SubSampleLabels
@@ -411,7 +411,7 @@ def train(
             shuffle_set_split=shuffle_set_split,
             n_samples_per_label=n_samples_per_label,
             batch_size=batch_size,
-            #distributed_sampler=use_distributed_sampler(trainer_kwargs.get("strategy", None)),
+            # distributed_sampler=use_distributed_sampler(trainer_kwargs.get("strategy", None)),
             **datasplitter_kwargs,
         )
         training_plan = self._training_plan_cls(self.module, self.n_labels, **plan_kwargs)
diff --git a/tests/dataloaders/test_datasplitter.py b/tests/dataloaders/test_datasplitter.py
index b8b9e8d72e..aad5d54b36 100644
--- a/tests/dataloaders/test_datasplitter.py
+++ b/tests/dataloaders/test_datasplitter.py
@@ -17,7 +17,10 @@ def test_datasplitter_shuffle(self):
 
         with pytest.raises(ValueError) as excinfo:
             scvi.dataloaders.DataSplitter(
-                manager, train_size=1.5, validation_size=0.3, shuffle_set_split=False,
+                manager,
+                train_size=1.5,
+                validation_size=0.3,
+                shuffle_set_split=False,
             )
         assert str(excinfo.value) == "Invalid train_size. Must be: 0 < train_size <= 1"
 
@@ -192,16 +195,25 @@ def test_datasplitter_distributed_sampler(self):
         adata = scvi.data.synthetic_iid()
         manager = generic_setup_adata_manager(adata)
         datasplitter_kwargs = {}
-        datasplitter_kwargs['distributed_sampler'] = True
+        datasplitter_kwargs["distributed_sampler"] = True
 
-        scvi.dataloaders.DataSplitter(manager, **datasplitter_kwargs,)
+        scvi.dataloaders.DataSplitter(
+            manager,
+            **datasplitter_kwargs,
+        )
 
     def test_semisupervised_datasplitter_distributed_sampler(self):
         adata = scvi.data.synthetic_iid()
-        manager = scanvi_setup_adata_manager(adata, labels_key="labels",unlabeled_category="label_0")
+        manager = scanvi_setup_adata_manager(
+            adata, labels_key="labels", unlabeled_category="label_0"
+        )
         datasplitter_kwargs = {}
-        datasplitter_kwargs['distributed_sampler'] = True
-        scvi.dataloaders.SemiSupervisedDataSplitter(adata_manager=manager, **datasplitter_kwargs,)
+        datasplitter_kwargs["distributed_sampler"] = True
+        scvi.dataloaders.SemiSupervisedDataSplitter(
+            adata_manager=manager,
+            **datasplitter_kwargs,
+        )
+
 
 @pytest.mark.parametrize("sparse_format", ["csr_matrix", "csc_matrix"])
 def test_datasplitter_load_sparse_tensor(
diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py
index 199d895abc..7d8fced0d2 100644
--- a/tests/model/test_scanvi.py
+++ b/tests/model/test_scanvi.py
@@ -1,6 +1,5 @@
 import os
 import pickle
-import subprocess
 
 import numpy as np
 import pandas as pd

From 523941368e8641e19f0831d01a77493c8438d89d Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 2 Dec 2024 21:36:37 +0200
Subject: [PATCH 17/42] remove pytest sys.argv workaround from dataloader tests

---
 tests/dataloaders/test_dataloaders.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 33795890a8..12802a02da 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -9,10 +9,6 @@
 import scvi
 from scvi import REGISTRY_KEYS
 
-if __name__ == "__main__" and "pytest" in sys.modules:
-    sys.argv = sys.argv[:1]  # Remove pytest arguments
-
-
 class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan):
     def __init__(self, *args, **kwargs):
         self.n_samples_per_label = kwargs.pop("n_samples_per_label")

From d61aebe1c000de8f9002ae6b3d5742f378a0e899 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 2 Dec 2024 19:38:25 +0000
Subject: [PATCH 18/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/dataloaders/test_dataloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 12802a02da..60e5e7ea34 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -1,5 +1,4 @@
 import os
-import sys
 
 import numpy as np
 import pytest
@@ -9,6 +8,7 @@
 import scvi
 from scvi import REGISTRY_KEYS
 
+
 class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan):
     def __init__(self, *args, **kwargs):
         self.n_samples_per_label = kwargs.pop("n_samples_per_label")

From 0d424656acd0288e4155b023eab22a3a9be43e6c Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 09:54:26 +0200
Subject: [PATCH 19/42] Fixed tests

---
 tests/dataloaders/test_dataloaders.py  | 35 ++++++++++++++++++++++++++
 tests/dataloaders/test_datasplitter.py | 25 +-----------------
 tests/model/test_scvi.py               |  7 ++----
 3 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 60e5e7ea34..737d674dd7 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -4,6 +4,7 @@
 import pytest
 import torch
 from tests.data.utils import generic_setup_adata_manager
+from scvi.model import SCANVI
 
 import scvi
 from scvi import REGISTRY_KEYS
@@ -130,3 +131,37 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
             nprocs=num_processes,
             join=True,
         )
+
+
+@pytest.mark.parametrize("num_processes", [1])
+def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
+    if torch.cuda.is_available():
+        adata = scvi.data.synthetic_iid()
+        manager = generic_setup_adata_manager(adata)
+        SCANVI.setup_anndata(
+            adata,
+            "labels",
+            "label_0",
+            batch_key="batch",
+        )
+        file_path = save_path + "/dist_file"
+        if os.path.exists(file_path):  # Check if the file exists
+            os.remove(file_path)
+        datasplitter_kwargs = {}
+        # Multi-GPU settings
+        datasplitter_kwargs["distributed_sampler"] = True
+        datasplitter_kwargs["drop_last"] = False
+        if num_processes == 1:
+            datasplitter_kwargs["distributed_sampler"] = False
+        model = SCANVI(adata, n_latent=10)
+
+        # initializes the distributed backend that takes care of synchronizing processes
+        torch.distributed.init_process_group(
+            "nccl",  # backend that works on all systems
+            init_method=f"file://{save_path}/dist_file",
+            rank=0,
+            world_size=num_processes,
+            store=None,
+        )
+
+        model.train(1, datasplitter_kwargs=datasplitter_kwargs)
diff --git a/tests/dataloaders/test_datasplitter.py b/tests/dataloaders/test_datasplitter.py
index aad5d54b36..3162194419 100644
--- a/tests/dataloaders/test_datasplitter.py
+++ b/tests/dataloaders/test_datasplitter.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pytest
 from sparse_utils import TestSparseModel
-from tests.data.utils import generic_setup_adata_manager, scanvi_setup_adata_manager
+from tests.data.utils import generic_setup_adata_manager
 
 import scvi
 
@@ -191,29 +191,6 @@ def test_datasplitter_external_with_duplicates(self):
             scvi.dataloaders.DataSplitter(manager, external_indexing=[train_ind])
         assert str(excinfo.value) == "There are duplicate indexing in train set"
 
-    def test_datasplitter_distributed_sampler(self):
-        adata = scvi.data.synthetic_iid()
-        manager = generic_setup_adata_manager(adata)
-        datasplitter_kwargs = {}
-        datasplitter_kwargs["distributed_sampler"] = True
-
-        scvi.dataloaders.DataSplitter(
-            manager,
-            **datasplitter_kwargs,
-        )
-
-    def test_semisupervised_datasplitter_distributed_sampler(self):
-        adata = scvi.data.synthetic_iid()
-        manager = scanvi_setup_adata_manager(
-            adata, labels_key="labels", unlabeled_category="label_0"
-        )
-        datasplitter_kwargs = {}
-        datasplitter_kwargs["distributed_sampler"] = True
-        scvi.dataloaders.SemiSupervisedDataSplitter(
-            adata_manager=manager,
-            **datasplitter_kwargs,
-        )
-
 
 @pytest.mark.parametrize("sparse_format", ["csr_matrix", "csc_matrix"])
 def test_datasplitter_load_sparse_tensor(
diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index f499fe7f75..334665a31b 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1300,7 +1300,7 @@ def test_scvi_num_workers():
     model.get_normalized_expression(n_samples=2)
 
 
-def test_scvi_train_ddp():
+def test_scvi_train_ddp(save_path: str):
     training_code = """
 import torch
 import scvi
@@ -1325,11 +1325,8 @@ def test_scvi_train_ddp():
 """
 
     if torch.cuda.is_available():
-        # Get the current working directory (CWD)
-        cwd = os.getcwd()
-
         # Define the file path for the temporary script in the current working directory
-        temp_file_path = os.path.join(cwd, "train_scvi_ddp_temp.py")
+        temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
 
         # Write the training code to the file in the current working directory
         with open(temp_file_path, "w") as temp_file:

From f8e44b5318a306384ac6cea3af9d8ac829543549 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 07:54:43 +0000
Subject: [PATCH 20/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/dataloaders/test_dataloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 737d674dd7..e4f62b1699 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -4,10 +4,10 @@
 import pytest
 import torch
 from tests.data.utils import generic_setup_adata_manager
-from scvi.model import SCANVI
 
 import scvi
 from scvi import REGISTRY_KEYS
+from scvi.model import SCANVI
 
 
 class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan):

From 7360989982e6b3c80bd4b893a58e5514d52b3ce3 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 09:59:56 +0200
Subject: [PATCH 21/42] fix pre commit

---
 tests/dataloaders/test_dataloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index e4f62b1699..f6418934b9 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -137,7 +137,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():
         adata = scvi.data.synthetic_iid()
-        manager = generic_setup_adata_manager(adata)
+        #manager = scanvi_setup_adata_manager(adata)
         SCANVI.setup_anndata(
             adata,
             "labels",

From c16358b0fcdf33c122947bc6253841371bd5362e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 08:00:21 +0000
Subject: [PATCH 22/42] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/dataloaders/test_dataloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index f6418934b9..54b15cdebd 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -137,7 +137,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():
         adata = scvi.data.synthetic_iid()
-        #manager = scanvi_setup_adata_manager(adata)
+        # manager = scanvi_setup_adata_manager(adata)
         SCANVI.setup_anndata(
             adata,
             "labels",

From b37efd5f532c4a9d35bacaf72f16cc49f1ab36bb Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 14:01:15 +0200
Subject: [PATCH 23/42] test precommit

---
 tests/dataloaders/test_dataloaders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 54b15cdebd..7632a720be 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 import torch
-from tests.data.utils import generic_setup_adata_manager
+from tests.data.utils import generic_setup_adata_manager, scanvi_setup_adata_manager
 
 import scvi
 from scvi import REGISTRY_KEYS
@@ -137,7 +137,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():
         adata = scvi.data.synthetic_iid()
-        # manager = scanvi_setup_adata_manager(adata)
+        manager = scanvi_setup_adata_manager(adata)
         SCANVI.setup_anndata(
             adata,
             "labels",

From 524516305ca1326463928b2227f49162ccfb5712 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 14:10:22 +0200
Subject: [PATCH 24/42] test precommit

---
 tests/dataloaders/test_dataloaders.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 7632a720be..0c6bec9804 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 import torch
-from tests.data.utils import generic_setup_adata_manager, scanvi_setup_adata_manager
+from tests.data.utils import generic_setup_adata_manager
 
 import scvi
 from scvi import REGISTRY_KEYS
@@ -137,7 +137,6 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
     if torch.cuda.is_available():
         adata = scvi.data.synthetic_iid()
-        manager = scanvi_setup_adata_manager(adata)
         SCANVI.setup_anndata(
             adata,
             "labels",

From 6f58e43478822e6324d6971bd1d919a2c40696b8 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 14:49:39 +0200
Subject: [PATCH 25/42] fix cuda test file

---
 .github/workflows/test_linux_cuda.yml | 2 +-
 tests/model/test_scvi.py              | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test_linux_cuda.yml b/.github/workflows/test_linux_cuda.yml
index 0eca4d65c8..d0bc164c9f 100644
--- a/.github/workflows/test_linux_cuda.yml
+++ b/.github/workflows/test_linux_cuda.yml
@@ -59,7 +59,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip wheel uv
           python -m uv pip install --system "scvi-tools[tests] @ ."
-          python -m pip install jax[cuda]
+          python -m pip install jax[cuda12]
           python -m pip install nvidia-nccl-cu12
 
       - name: Run pytest
diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index 334665a31b..439976715f 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1300,7 +1300,7 @@ def test_scvi_num_workers():
     model.get_normalized_expression(n_samples=2)
 
 
-def test_scvi_train_ddp(save_path: str):
+def test_scvi_train_ddp(save_path: str = "."):
     training_code = """
 import torch
 import scvi
@@ -1312,7 +1312,7 @@ def test_scvi_train_ddp(save_path: str):
 model = SCVI(adata)
 
 model.train(
-    max_epochs=100,
+    max_epochs=1,
     check_val_every_n_epoch=1,
     accelerator="gpu",
     devices=-1,

From 2b9fe3e74c5b26c033583e13766192dc8451742b Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 15:59:04 +0200
Subject: [PATCH 26/42] added jax accelerator for tests

---
 tests/model/test_jaxscvi.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py
index dac25b7c89..055d6b8dd0 100644
--- a/tests/model/test_jaxscvi.py
+++ b/tests/model/test_jaxscvi.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pytest
+import torch
 from flax import linen as nn
 
 from scvi.data import synthetic_iid
@@ -10,17 +11,19 @@
 
 
 def test_jax_scvi(n_latent=5):
+    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
+
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
         adata,
         batch_key="batch",
     )
     model = JaxSCVI(adata, n_latent=n_latent)
-    model.train(2, train_size=0.5, check_val_every_n_epoch=1)
+    model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator)
     model.get_latent_representation()
 
     model = JaxSCVI(adata, n_latent=n_latent, gene_likelihood="poisson")
-    model.train(1, train_size=0.5)
+    model.train(1, train_size=0.5, accelerator=accelerator)
     z1 = model.get_latent_representation(give_mean=True, n_samples=1)
     assert z1.ndim == 2
     z2 = model.get_latent_representation(give_mean=False, n_samples=15)
@@ -29,6 +32,7 @@ def test_jax_scvi(n_latent=5):
 
 
 def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
+    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
         adata,
@@ -42,7 +46,7 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
         mock_dropout = mock.Mock()
         mock_dropout.side_effect = lambda h, **_kwargs: h
         mock_dropout_cls.return_value = mock_dropout
-        model.train(1, train_size=0.5, check_val_every_n_epoch=1)
+        model.train(1, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator)
 
         assert not model.module.training
         mock_dropout_cls.assert_called()
@@ -53,13 +57,14 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
 
 
 def test_jax_scvi_save_load(save_path: str, n_latent: int = 5):
+    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
         adata,
         batch_key="batch",
     )
     model = JaxSCVI(adata, n_latent=n_latent)
-    model.train(2, train_size=0.5, check_val_every_n_epoch=1)
+    model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator)
     z1 = model.get_latent_representation(adata)
     model.save(save_path, overwrite=True, save_anndata=True)
     model.view_setup_args(save_path)

From 6f51b8e5d3b879616daef341dd6e3a13f3e21a50 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 16:09:44 +0200
Subject: [PATCH 27/42] make jax tests private so they will not run here

---
 tests/model/test_jaxscvi.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py
index 055d6b8dd0..1034b03956 100644
--- a/tests/model/test_jaxscvi.py
+++ b/tests/model/test_jaxscvi.py
@@ -10,6 +10,7 @@
 from scvi.utils import attrdict
 
 
+@pytest.mark.private
 def test_jax_scvi(n_latent=5):
     accelerator = "gpu" if torch.cuda.is_available() else "cpu"
 
@@ -31,6 +32,7 @@ def test_jax_scvi(n_latent=5):
     assert z2.shape[0] == 15
 
 
+@pytest.mark.private
 def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
     accelerator = "gpu" if torch.cuda.is_available() else "cpu"
     adata = synthetic_iid()
@@ -56,6 +58,7 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
         )
 
 
+@pytest.mark.private
 def test_jax_scvi_save_load(save_path: str, n_latent: int = 5):
     accelerator = "gpu" if torch.cuda.is_available() else "cpu"
     adata = synthetic_iid()

From 7cd2d437d7f4ab7356b6e15b0c8907491c5c4439 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 16:14:21 +0200
Subject: [PATCH 28/42] fix test scvi ddp

---
 tests/model/test_scvi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index 439976715f..44e15ddfcc 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1300,7 +1300,7 @@ def test_scvi_num_workers():
     model.get_normalized_expression(n_samples=2)
 
 
-def test_scvi_train_ddp(save_path: str = "."):
+def test_scvi_train_ddp(save_path: str):
     training_code = """
 import torch
 import scvi

From 72e4682e7f49d3d23711ab60a1de14e301a012e6 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 16:26:45 +0200
Subject: [PATCH 29/42] revert jax tests

---
 tests/model/test_jaxscvi.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py
index 1034b03956..0d4a97a914 100644
--- a/tests/model/test_jaxscvi.py
+++ b/tests/model/test_jaxscvi.py
@@ -2,7 +2,8 @@
 
 import numpy as np
 import pytest
-import torch
+
+# import torch
 from flax import linen as nn
 
 from scvi.data import synthetic_iid
@@ -10,9 +11,8 @@
 from scvi.utils import attrdict
 
 
-@pytest.mark.private
 def test_jax_scvi(n_latent=5):
-    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
+    # accelerator = "gpu" if torch.cuda.is_available() else "cpu"
 
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
@@ -20,11 +20,11 @@ def test_jax_scvi(n_latent=5):
         batch_key="batch",
     )
     model = JaxSCVI(adata, n_latent=n_latent)
-    model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator)
+    model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu")
     model.get_latent_representation()
 
     model = JaxSCVI(adata, n_latent=n_latent, gene_likelihood="poisson")
-    model.train(1, train_size=0.5, accelerator=accelerator)
+    model.train(1, train_size=0.5, accelerator="cpu")
     z1 = model.get_latent_representation(give_mean=True, n_samples=1)
     assert z1.ndim == 2
     z2 = model.get_latent_representation(give_mean=False, n_samples=15)
@@ -32,9 +32,8 @@ def test_jax_scvi(n_latent=5):
     assert z2.shape[0] == 15
 
 
-@pytest.mark.private
 def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
-    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
+    # accelerator = "gpu" if torch.cuda.is_available() else "cpu"
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
         adata,
@@ -48,7 +47,7 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
         mock_dropout = mock.Mock()
         mock_dropout.side_effect = lambda h, **_kwargs: h
         mock_dropout_cls.return_value = mock_dropout
-        model.train(1, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator)
+        model.train(1, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu")
 
         assert not model.module.training
         mock_dropout_cls.assert_called()
@@ -58,16 +57,15 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
         )
 
 
-@pytest.mark.private
 def test_jax_scvi_save_load(save_path: str, n_latent: int = 5):
-    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
+    # accelerator = "gpu" if torch.cuda.is_available() else "cpu"
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
         adata,
         batch_key="batch",
     )
     model = JaxSCVI(adata, n_latent=n_latent)
-    model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator)
+    model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu")
     z1 = model.get_latent_representation(adata)
     model.save(save_path, overwrite=True, save_anndata=True)
     model.view_setup_args(save_path)

From 861e5892ac24128e35fab33d80158dd1defa95d1 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Tue, 3 Dec 2024 23:41:33 +0200
Subject: [PATCH 30/42] comment out the ddp test?

---
 tests/model/test_jaxscvi.py |   8 +--
 tests/model/test_scvi.py    | 105 ++++++++++++++++++------------------
 2 files changed, 56 insertions(+), 57 deletions(-)

diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py
index 0d4a97a914..1d6ee6b313 100644
--- a/tests/model/test_jaxscvi.py
+++ b/tests/model/test_jaxscvi.py
@@ -20,11 +20,11 @@ def test_jax_scvi(n_latent=5):
         batch_key="batch",
     )
     model = JaxSCVI(adata, n_latent=n_latent)
-    model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu")
+    model.train(2, train_size=0.5, check_val_every_n_epoch=1)
     model.get_latent_representation()
 
     model = JaxSCVI(adata, n_latent=n_latent, gene_likelihood="poisson")
-    model.train(1, train_size=0.5, accelerator="cpu")
+    model.train(1, train_size=0.5)
     z1 = model.get_latent_representation(give_mean=True, n_samples=1)
     assert z1.ndim == 2
     z2 = model.get_latent_representation(give_mean=False, n_samples=15)
@@ -47,7 +47,7 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
         mock_dropout = mock.Mock()
         mock_dropout.side_effect = lambda h, **_kwargs: h
         mock_dropout_cls.return_value = mock_dropout
-        model.train(1, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu")
+        model.train(1, train_size=0.5, check_val_every_n_epoch=1)
 
         assert not model.module.training
         mock_dropout_cls.assert_called()
@@ -65,7 +65,7 @@ def test_jax_scvi_save_load(save_path: str, n_latent: int = 5):
         batch_key="batch",
     )
     model = JaxSCVI(adata, n_latent=n_latent)
-    model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu")
+    model.train(2, train_size=0.5, check_val_every_n_epoch=1)
     z1 = model.get_latent_representation(adata)
     model.save(save_path, overwrite=True, save_anndata=True)
     model.view_setup_args(save_path)
diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index 44e15ddfcc..acbee48228 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1,7 +1,6 @@
 import inspect
 import os
 import pickle
-import subprocess
 import tarfile
 from unittest import mock
 
@@ -1300,55 +1299,55 @@ def test_scvi_num_workers():
     model.get_normalized_expression(n_samples=2)
 
 
-def test_scvi_train_ddp(save_path: str):
-    training_code = """
-import torch
-import scvi
-from scvi.model import SCVI
-
-adata = scvi.data.synthetic_iid()
-SCVI.setup_anndata(adata)
-
-model = SCVI(adata)
-
-model.train(
-    max_epochs=1,
-    check_val_every_n_epoch=1,
-    accelerator="gpu",
-    devices=-1,
-    strategy="ddp_find_unused_parameters_true",
-)
-
-torch.distributed.destroy_process_group()
-
-assert model.is_trained
-"""
-
-    if torch.cuda.is_available():
-        # Define the file path for the temporary script in the current working directory
-        temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
-
-        # Write the training code to the file in the current working directory
-        with open(temp_file_path, "w") as temp_file:
-            temp_file.write(training_code)
-            print(f"Temporary Python file created at: {temp_file_path}")
-
-        def launch_ddp(world_size, temp_file_path):
-            # Command to run the script via torchrun
-            command = [
-                "torchrun",
-                "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-                temp_file_path,  # Your original script
-            ]
-            # Use subprocess to run the command
-            try:
-                # Run the command, wait for it to finish & clean up the temporary file
-                subprocess.run(command, check=True)
-            except subprocess.CalledProcessError as e:
-                os.remove(temp_file_path)
-                print(f"Error occurred while running the DDP training: {e}")
-                raise
-            finally:
-                os.remove(temp_file_path)
-
-        launch_ddp(torch.cuda.device_count(), temp_file_path)
+# def test_scvi_train_ddp(save_path: str):
+#     training_code = """
+# import torch
+# import scvi
+# from scvi.model import SCVI
+#
+# adata = scvi.data.synthetic_iid()
+# SCVI.setup_anndata(adata)
+#
+# model = SCVI(adata)
+#
+# model.train(
+#     max_epochs=1,
+#     check_val_every_n_epoch=1,
+#     accelerator="gpu",
+#     devices=-1,
+#     strategy="ddp_find_unused_parameters_true",
+# )
+#
+# torch.distributed.destroy_process_group()
+#
+# assert model.is_trained
+# """
+#
+#     if torch.cuda.is_available():
+#         # Define the file path for the temporary script in the current working directory
+#         temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
+#
+#         # Write the training code to the file in the current working directory
+#         with open(temp_file_path, "w") as temp_file:
+#             temp_file.write(training_code)
+#             print(f"Temporary Python file created at: {temp_file_path}")
+#
+#         def launch_ddp(world_size, temp_file_path):
+#             # Command to run the script via torchrun
+#             command = [
+#                 "torchrun",
+#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+#                 temp_file_path,  # Your original script
+#             ]
+#             # Use subprocess to run the command
+#             try:
+#                 # Run the command, wait for it to finish & clean up the temporary file
+#                 subprocess.run(command, check=True)
+#             except subprocess.CalledProcessError as e:
+#                 os.remove(temp_file_path)
+#                 print(f"Error occurred while running the DDP training: {e}")
+#                 raise
+#             finally:
+#                 os.remove(temp_file_path)
+#
+#         launch_ddp(torch.cuda.device_count(), temp_file_path)

From 58c629d90377d6ee162097e0d622ea503db3888e Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Wed, 4 Dec 2024 00:27:40 +0200
Subject: [PATCH 31/42] revert fixes

---
 .github/workflows/test_linux_cuda.yml   |  2 +-
 src/scvi/dataloaders/_ann_dataloader.py |  8 ++++----
 src/scvi/dataloaders/_samplers.py       | 14 +++++++-------
 src/scvi/model/_scanvi.py               |  1 -
 tests/model/test_jaxscvi.py             |  4 ----
 tests/model/test_scvi.py                |  2 +-
 6 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/test_linux_cuda.yml b/.github/workflows/test_linux_cuda.yml
index d0bc164c9f..0eca4d65c8 100644
--- a/.github/workflows/test_linux_cuda.yml
+++ b/.github/workflows/test_linux_cuda.yml
@@ -59,7 +59,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip wheel uv
           python -m uv pip install --system "scvi-tools[tests] @ ."
-          python -m pip install jax[cuda12]
+          python -m pip install jax[cuda]
           python -m pip install nvidia-nccl-cu12
 
       - name: Run pytest
diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py
index 82a13ac0b6..eca803c581 100644
--- a/src/scvi/dataloaders/_ann_dataloader.py
+++ b/src/scvi/dataloaders/_ann_dataloader.py
@@ -117,10 +117,6 @@ def __init__(
                     batch_size=batch_size,
                     drop_last=drop_last,
                 )
-                # do not touch batch size here, sampler gives batched indices
-                # This disables PyTorch automatic batching, which is necessary
-                # for fast access to sparse matrices
-                self.kwargs.update({"batch_size": None, "shuffle": False})
             else:
                 sampler = BatchDistributedSampler(
                     self.dataset,
@@ -130,6 +126,10 @@ def __init__(
                     shuffle=shuffle,
                     **kwargs,
                 )
+            # do not touch batch size here, sampler gives batched indices
+            # This disables PyTorch automatic batching, which is necessary
+            # for fast access to sparse matrices
+            self.kwargs.update({"batch_size": None, "shuffle": False})
 
         self.kwargs.update({"sampler": sampler})
 
diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index 8753c6feec..667f27d699 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -36,13 +36,13 @@ def __init__(
         drop_dataset_tail: bool = False,
         **kwargs,
     ):
-        for redundant_key in [
-            "pin_memory",
-            "num_workers",
-            "persistent_workers",
-        ]:
-            if redundant_key in kwargs:
-                kwargs.pop(redundant_key)
+        # for redundant_key in [
+        #     "pin_memory",
+        #     "num_workers",
+        #     "persistent_workers",
+        # ]:
+        #     if redundant_key in kwargs:
+        #         kwargs.pop(redundant_key)
 
         super().__init__(dataset, drop_last=drop_dataset_tail, **kwargs)
         self.batch_size = batch_size
diff --git a/src/scvi/model/_scanvi.py b/src/scvi/model/_scanvi.py
index edd75e5324..084d83be1f 100644
--- a/src/scvi/model/_scanvi.py
+++ b/src/scvi/model/_scanvi.py
@@ -411,7 +411,6 @@ def train(
             shuffle_set_split=shuffle_set_split,
             n_samples_per_label=n_samples_per_label,
             batch_size=batch_size,
-            # distributed_sampler=use_distributed_sampler(trainer_kwargs.get("strategy", None)),
             **datasplitter_kwargs,
         )
         training_plan = self._training_plan_cls(self.module, self.n_labels, **plan_kwargs)
diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py
index 1d6ee6b313..63e50136cb 100644
--- a/tests/model/test_jaxscvi.py
+++ b/tests/model/test_jaxscvi.py
@@ -12,8 +12,6 @@
 
 
 def test_jax_scvi(n_latent=5):
-    # accelerator = "gpu" if torch.cuda.is_available() else "cpu"
-
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
         adata,
@@ -33,7 +31,6 @@ def test_jax_scvi(n_latent=5):
 
 
 def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
-    # accelerator = "gpu" if torch.cuda.is_available() else "cpu"
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
         adata,
@@ -58,7 +55,6 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1):
 
 
 def test_jax_scvi_save_load(save_path: str, n_latent: int = 5):
-    # accelerator = "gpu" if torch.cuda.is_available() else "cpu"
     adata = synthetic_iid()
     JaxSCVI.setup_anndata(
         adata,
diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index acbee48228..af8e5e8579 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1322,7 +1322,7 @@ def test_scvi_num_workers():
 #
 # assert model.is_trained
 # """
-#
+#     import subprocess
 #     if torch.cuda.is_available():
 #         # Define the file path for the temporary script in the current working directory
 #         temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")

From ba6c9f0b2608dbc251bf731a640a94b80914945d Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Wed, 4 Dec 2024 00:43:34 +0200
Subject: [PATCH 32/42] revert fixes

---
 src/scvi/dataloaders/_ann_dataloader.py | 2 +-
 tests/model/test_jaxscvi.py             | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py
index eca803c581..04bbae6b6c 100644
--- a/src/scvi/dataloaders/_ann_dataloader.py
+++ b/src/scvi/dataloaders/_ann_dataloader.py
@@ -136,4 +136,4 @@ def __init__(
         if iter_ndarray:
             self.kwargs.update({"collate_fn": lambda x: x})
 
-        super().__init__(self.dataset, drop_last=drop_dataset_tail, **self.kwargs)
+        super().__init__(self.dataset, **self.kwargs)
diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py
index 63e50136cb..dac25b7c89 100644
--- a/tests/model/test_jaxscvi.py
+++ b/tests/model/test_jaxscvi.py
@@ -2,8 +2,6 @@
 
 import numpy as np
 import pytest
-
-# import torch
 from flax import linen as nn
 
 from scvi.data import synthetic_iid

From dd8641d0bf6e1bf5ae42538f167b5004fd479ee7 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Wed, 4 Dec 2024 15:27:19 +0200
Subject: [PATCH 33/42] more fixes

---
 src/scvi/dataloaders/_ann_dataloader.py    |  2 +-
 src/scvi/dataloaders/_concat_dataloader.py | 13 +++++++------
 tests/dataloaders/test_dataloaders.py      |  2 ++
 tests/model/test_scanvi.py                 |  9 +++------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py
index 04bbae6b6c..7860184786 100644
--- a/src/scvi/dataloaders/_ann_dataloader.py
+++ b/src/scvi/dataloaders/_ann_dataloader.py
@@ -124,7 +124,7 @@ def __init__(
                     drop_last=drop_last,
                     drop_dataset_tail=drop_dataset_tail,
                     shuffle=shuffle,
-                    **kwargs,
+                    # **kwargs,
                 )
             # do not touch batch size here, sampler gives batched indices
             # This disables PyTorch automatic batching, which is necessary
diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py
index 554f0267f0..1d041758db 100644
--- a/src/scvi/dataloaders/_concat_dataloader.py
+++ b/src/scvi/dataloaders/_concat_dataloader.py
@@ -53,11 +53,12 @@ def __init__(
         self._shuffle = shuffle
         self._batch_size = batch_size
         self._drop_last = drop_last
-        self._drop_dataset_tail = (
-            self.dataloader_kwargs["drop_dataset_tail"]
-            if "drop_dataset_tail" in self.dataloader_kwargs.keys()
-            else False
-        )
+        self._distributed_sampler = distributed_sampler
+        # self._drop_dataset_tail = (
+        #     self.dataloader_kwargs["drop_dataset_tail"]
+        #     if "drop_dataset_tail" in self.dataloader_kwargs.keys()
+        #     else False
+        # )
 
         self.dataloaders = []
         for indices in indices_list:
@@ -75,7 +76,7 @@ def __init__(
             )
         lens = [len(dl) for dl in self.dataloaders]
         self.largest_dl = self.dataloaders[np.argmax(lens)]
-        super().__init__(self.largest_dl, drop_last=self._drop_dataset_tail, **data_loader_kwargs)
+        super().__init__(self.largest_dl, **data_loader_kwargs)
 
     def __len__(self):
         return len(self.largest_dl)
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 0c6bec9804..8b320b7aeb 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -164,3 +164,5 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
         )
 
         model.train(1, datasplitter_kwargs=datasplitter_kwargs)
+
+        torch.distributed.destroy_process_group()
diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py
index 7d8fced0d2..11349da290 100644
--- a/tests/model/test_scanvi.py
+++ b/tests/model/test_scanvi.py
@@ -580,7 +580,7 @@ def check_no_logits_and_softmax(model: SCANVI):
     check_no_logits_and_softmax(model)
 
 
-# def test_scanvi_train_ddp():
+# def test_scanvi_train_ddp(save_path: str):
 #     training_code = """
 # import torch
 # import scvi
@@ -609,13 +609,10 @@ def check_no_logits_and_softmax(model: SCANVI):
 #
 # assert model.is_trained
 # """
-#
+#     import subprocess
 #     if torch.cuda.is_available():
-#         # Get the current working directory (CWD)
-#         cwd = os.getcwd()
-#
 #         # Define the file path for the temporary script in the current working directory
-#         temp_file_path = os.path.join(cwd, "train_scanvi_ddp_temp.py")
+#         temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py")
 #
 #         # Write the training code to the file in the current working directory
 #         with open(temp_file_path, "w") as temp_file:

From bbf00231bc5c03edf730b08b3d4293148a3f0257 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Wed, 4 Dec 2024 18:40:45 +0200
Subject: [PATCH 34/42] more fixes

---
 tests/model/test_scvi.py | 105 ++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 52 deletions(-)

diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index af8e5e8579..28e56b47d6 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1299,55 +1299,56 @@ def test_scvi_num_workers():
     model.get_normalized_expression(n_samples=2)
 
 
-# def test_scvi_train_ddp(save_path: str):
-#     training_code = """
-# import torch
-# import scvi
-# from scvi.model import SCVI
-#
-# adata = scvi.data.synthetic_iid()
-# SCVI.setup_anndata(adata)
-#
-# model = SCVI(adata)
-#
-# model.train(
-#     max_epochs=1,
-#     check_val_every_n_epoch=1,
-#     accelerator="gpu",
-#     devices=-1,
-#     strategy="ddp_find_unused_parameters_true",
-# )
-#
-# torch.distributed.destroy_process_group()
-#
-# assert model.is_trained
-# """
-#     import subprocess
-#     if torch.cuda.is_available():
-#         # Define the file path for the temporary script in the current working directory
-#         temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
-#
-#         # Write the training code to the file in the current working directory
-#         with open(temp_file_path, "w") as temp_file:
-#             temp_file.write(training_code)
-#             print(f"Temporary Python file created at: {temp_file_path}")
-#
-#         def launch_ddp(world_size, temp_file_path):
-#             # Command to run the script via torchrun
-#             command = [
-#                 "torchrun",
-#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-#                 temp_file_path,  # Your original script
-#             ]
-#             # Use subprocess to run the command
-#             try:
-#                 # Run the command, wait for it to finish & clean up the temporary file
-#                 subprocess.run(command, check=True)
-#             except subprocess.CalledProcessError as e:
-#                 os.remove(temp_file_path)
-#                 print(f"Error occurred while running the DDP training: {e}")
-#                 raise
-#             finally:
-#                 os.remove(temp_file_path)
-#
-#         launch_ddp(torch.cuda.device_count(), temp_file_path)
+def test_scvi_train_ddp(save_path: str):
+    training_code = """
+import torch
+import scvi
+from scvi.model import SCVI
+
+adata = scvi.data.synthetic_iid()
+SCVI.setup_anndata(adata)
+
+model = SCVI(adata)
+
+model.train(
+    max_epochs=1,
+    check_val_every_n_epoch=1,
+    accelerator="gpu",
+    devices=-1,
+    strategy="ddp_find_unused_parameters_true",
+)
+
+torch.distributed.destroy_process_group()
+
+assert model.is_trained
+"""
+    import subprocess
+
+    if torch.cuda.is_available():
+        # Define the file path for the temporary script in the current working directory
+        temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
+
+        # Write the training code to the file in the current working directory
+        with open(temp_file_path, "w") as temp_file:
+            temp_file.write(training_code)
+            print(f"Temporary Python file created at: {temp_file_path}")
+
+        def launch_ddp(world_size, temp_file_path):
+            # Command to run the script via torchrun
+            command = [
+                "torchrun",
+                "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+                temp_file_path,  # Your original script
+            ]
+            # Use subprocess to run the command
+            try:
+                # Run the command, wait for it to finish & clean up the temporary file
+                subprocess.run(command, check=True)
+            except subprocess.CalledProcessError as e:
+                os.remove(temp_file_path)
+                print(f"Error occurred while running the DDP training: {e}")
+                raise
+            finally:
+                os.remove(temp_file_path)
+
+        launch_ddp(torch.cuda.device_count(), temp_file_path)

From 3dc4b334cf9c1520a2ea3d8f58d8bceb5b48e5d9 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Wed, 4 Dec 2024 18:58:10 +0200
Subject: [PATCH 35/42] revert the test which fails on the runner

---
 tests/model/test_scvi.py | 105 +++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 53 deletions(-)

diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index 28e56b47d6..af8e5e8579 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1299,56 +1299,55 @@ def test_scvi_num_workers():
     model.get_normalized_expression(n_samples=2)
 
 
-def test_scvi_train_ddp(save_path: str):
-    training_code = """
-import torch
-import scvi
-from scvi.model import SCVI
-
-adata = scvi.data.synthetic_iid()
-SCVI.setup_anndata(adata)
-
-model = SCVI(adata)
-
-model.train(
-    max_epochs=1,
-    check_val_every_n_epoch=1,
-    accelerator="gpu",
-    devices=-1,
-    strategy="ddp_find_unused_parameters_true",
-)
-
-torch.distributed.destroy_process_group()
-
-assert model.is_trained
-"""
-    import subprocess
-
-    if torch.cuda.is_available():
-        # Define the file path for the temporary script in the current working directory
-        temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
-
-        # Write the training code to the file in the current working directory
-        with open(temp_file_path, "w") as temp_file:
-            temp_file.write(training_code)
-            print(f"Temporary Python file created at: {temp_file_path}")
-
-        def launch_ddp(world_size, temp_file_path):
-            # Command to run the script via torchrun
-            command = [
-                "torchrun",
-                "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-                temp_file_path,  # Your original script
-            ]
-            # Use subprocess to run the command
-            try:
-                # Run the command, wait for it to finish & clean up the temporary file
-                subprocess.run(command, check=True)
-            except subprocess.CalledProcessError as e:
-                os.remove(temp_file_path)
-                print(f"Error occurred while running the DDP training: {e}")
-                raise
-            finally:
-                os.remove(temp_file_path)
-
-        launch_ddp(torch.cuda.device_count(), temp_file_path)
+# def test_scvi_train_ddp(save_path: str):
+#     training_code = """
+# import torch
+# import scvi
+# from scvi.model import SCVI
+#
+# adata = scvi.data.synthetic_iid()
+# SCVI.setup_anndata(adata)
+#
+# model = SCVI(adata)
+#
+# model.train(
+#     max_epochs=1,
+#     check_val_every_n_epoch=1,
+#     accelerator="gpu",
+#     devices=-1,
+#     strategy="ddp_find_unused_parameters_true",
+# )
+#
+# torch.distributed.destroy_process_group()
+#
+# assert model.is_trained
+# """
+#     import subprocess
+#     if torch.cuda.is_available():
+#         # Define the file path for the temporary script in the current working directory
+#         temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
+#
+#         # Write the training code to the file in the current working directory
+#         with open(temp_file_path, "w") as temp_file:
+#             temp_file.write(training_code)
+#             print(f"Temporary Python file created at: {temp_file_path}")
+#
+#         def launch_ddp(world_size, temp_file_path):
+#             # Command to run the script via torchrun
+#             command = [
+#                 "torchrun",
+#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+#                 temp_file_path,  # Your original script
+#             ]
+#             # Use subprocess to run the command
+#             try:
+#                 # Run the command, wait for it to finish & clean up the temporary file
+#                 subprocess.run(command, check=True)
+#             except subprocess.CalledProcessError as e:
+#                 os.remove(temp_file_path)
+#                 print(f"Error occurred while running the DDP training: {e}")
+#                 raise
+#             finally:
+#                 os.remove(temp_file_path)
+#
+#         launch_ddp(torch.cuda.device_count(), temp_file_path)

From 0a35c19e4d7b9bdfae9767d7939452ef1e776f87 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Sun, 8 Dec 2024 15:01:26 +0200
Subject: [PATCH 36/42] moved ddp test to a new file

---
 tests/dataloaders/test_dataloaders.py |   2 +-
 tests/model/test_multigpu.py          | 111 ++++++++++++++++++++++++++
 tests/model/test_scanvi.py            |  60 --------------
 tests/model/test_scvi.py              |  54 -------------
 4 files changed, 112 insertions(+), 115 deletions(-)
 create mode 100644 tests/model/test_multigpu.py

diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index 8b320b7aeb..c9fd30f29c 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -103,7 +103,7 @@ def multiprocessing_worker(
 ):
     # initializes the distributed backend that takes care of synchronizing processes
     torch.distributed.init_process_group(
-        "nccl",  # backend that works on all systems
+        "nccl",
         init_method=f"file://{save_path}/dist_file",
         rank=rank,
         world_size=world_size,
diff --git a/tests/model/test_multigpu.py b/tests/model/test_multigpu.py
new file mode 100644
index 0000000000..929caa9dc8
--- /dev/null
+++ b/tests/model/test_multigpu.py
@@ -0,0 +1,111 @@
+# def test_scvi_train_ddp(save_path: str):
+#     training_code = """
+# import torch
+# import scvi
+# from scvi.model import SCVI
+#
+# adata = scvi.data.synthetic_iid()
+# SCVI.setup_anndata(adata)
+#
+# model = SCVI(adata)
+#
+# model.train(
+#     max_epochs=1,
+#     check_val_every_n_epoch=1,
+#     accelerator="gpu",
+#     devices=-1,
+#     strategy="ddp_find_unused_parameters_true",
+# )
+#
+# torch.distributed.destroy_process_group()
+#
+# assert model.is_trained
+# """
+#     import subprocess
+#     if torch.cuda.is_available():
+#         # Define the file path for the temporary script in the current working directory
+#         temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
+#
+#         # Write the training code to the file in the current working directory
+#         with open(temp_file_path, "w") as temp_file:
+#             temp_file.write(training_code)
+#             print(f"Temporary Python file created at: {temp_file_path}")
+#
+#         def launch_ddp(world_size, temp_file_path):
+#             # Command to run the script via torchrun
+#             command = [
+#                 "torchrun",
+#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+#                 temp_file_path,  # Your original script
+#             ]
+#             # Use subprocess to run the command
+#             try:
+#                 # Run the command, wait for it to finish & clean up the temporary file
+#                 subprocess.run(command, check=True)
+#             except subprocess.CalledProcessError as e:
+#                 os.remove(temp_file_path)
+#                 print(f"Error occurred while running the DDP training: {e}")
+#                 raise
+#             finally:
+#                 os.remove(temp_file_path)
+#
+#         launch_ddp(torch.cuda.device_count(), temp_file_path)
+#
+# def test_scanvi_train_ddp(save_path: str):
+#     training_code = """
+# import torch
+# import scvi
+# from scvi.model import SCANVI
+#
+# adata = scvi.data.synthetic_iid()
+# SCANVI.setup_anndata(
+#     adata,
+#     "labels",
+#     "label_0",
+#     batch_key="batch",
+# )
+#
+# model = SCANVI(adata, n_latent=10)
+#
+# model.train(
+#     max_epochs=100,
+#     train_size=0.5,
+#     check_val_every_n_epoch=1,
+#     accelerator="gpu",
+#     devices=-1,
+#     strategy="ddp_find_unused_parameters_true",
+# )
+#
+# torch.distributed.destroy_process_group()
+#
+# assert model.is_trained
+# """
+#     import subprocess
+#     if torch.cuda.is_available():
+#         # Define the file path for the temporary script in the current working directory
+#         temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py")
+#
+#         # Write the training code to the file in the current working directory
+#         with open(temp_file_path, "w") as temp_file:
+#             temp_file.write(training_code)
+#             print(f"Temporary Python file created at: {temp_file_path}")
+#
+#         def launch_ddp(world_size, temp_file_path):
+#             # Command to run the script via torchrun
+#             command = [
+#                 "torchrun",
+#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+#                 temp_file_path,  # Your original script
+#             ]
+#             # Use subprocess to run the command
+#             try:
+#                 # Run the command, wait for it to finish & clean up the temporary file
+#                 subprocess.run(command, check=True)
+#             except subprocess.CalledProcessError as e:
+#                 os.remove(temp_file_path)
+#                 print(f"Error occurred while running the DDP training: {e}")
+#                 raise
+#             finally:
+#                 os.remove(temp_file_path)
+#
+#         launch_ddp(torch.cuda.device_count(), temp_file_path)
diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py
index 11349da290..aede994029 100644
--- a/tests/model/test_scanvi.py
+++ b/tests/model/test_scanvi.py
@@ -578,63 +578,3 @@ def check_no_logits_and_softmax(model: SCANVI):
 
     model = SCANVI.load(resave_model_path, adata)
     check_no_logits_and_softmax(model)
-
-
-# def test_scanvi_train_ddp(save_path: str):
-#     training_code = """
-# import torch
-# import scvi
-# from scvi.model import SCANVI
-#
-# adata = scvi.data.synthetic_iid()
-# SCANVI.setup_anndata(
-#     adata,
-#     "labels",
-#     "label_0",
-#     batch_key="batch",
-# )
-#
-# model = SCANVI(adata, n_latent=10)
-#
-# model.train(
-#     max_epochs=100,
-#     train_size=0.5,
-#     check_val_every_n_epoch=1,
-#     accelerator="gpu",
-#     devices=-1,
-#     strategy="ddp_find_unused_parameters_true",
-# )
-#
-# torch.distributed.destroy_process_group()
-#
-# assert model.is_trained
-# """
-#     import subprocess
-#     if torch.cuda.is_available():
-#         # Define the file path for the temporary script in the current working directory
-#         temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py")
-#
-#         # Write the training code to the file in the current working directory
-#         with open(temp_file_path, "w") as temp_file:
-#             temp_file.write(training_code)
-#             print(f"Temporary Python file created at: {temp_file_path}")
-#
-#         def launch_ddp(world_size, temp_file_path):
-#             # Command to run the script via torchrun
-#             command = [
-#                 "torchrun",
-#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-#                 temp_file_path,  # Your original script
-#             ]
-#             # Use subprocess to run the command
-#             try:
-#                 # Run the command, wait for it to finish & clean up the temporary file
-#                 subprocess.run(command, check=True)
-#             except subprocess.CalledProcessError as e:
-#                 os.remove(temp_file_path)
-#                 print(f"Error occurred while running the DDP training: {e}")
-#                 raise
-#             finally:
-#                 os.remove(temp_file_path)
-#
-#         launch_ddp(torch.cuda.device_count(), temp_file_path)
diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
index af8e5e8579..49fe18e531 100644
--- a/tests/model/test_scvi.py
+++ b/tests/model/test_scvi.py
@@ -1297,57 +1297,3 @@ def test_scvi_num_workers():
     model.get_reconstruction_error()
     model.get_normalized_expression(transform_batch="batch_1")
     model.get_normalized_expression(n_samples=2)
-
-
-# def test_scvi_train_ddp(save_path: str):
-#     training_code = """
-# import torch
-# import scvi
-# from scvi.model import SCVI
-#
-# adata = scvi.data.synthetic_iid()
-# SCVI.setup_anndata(adata)
-#
-# model = SCVI(adata)
-#
-# model.train(
-#     max_epochs=1,
-#     check_val_every_n_epoch=1,
-#     accelerator="gpu",
-#     devices=-1,
-#     strategy="ddp_find_unused_parameters_true",
-# )
-#
-# torch.distributed.destroy_process_group()
-#
-# assert model.is_trained
-# """
-#     import subprocess
-#     if torch.cuda.is_available():
-#         # Define the file path for the temporary script in the current working directory
-#         temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
-#
-#         # Write the training code to the file in the current working directory
-#         with open(temp_file_path, "w") as temp_file:
-#             temp_file.write(training_code)
-#             print(f"Temporary Python file created at: {temp_file_path}")
-#
-#         def launch_ddp(world_size, temp_file_path):
-#             # Command to run the script via torchrun
-#             command = [
-#                 "torchrun",
-#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-#                 temp_file_path,  # Your original script
-#             ]
-#             # Use subprocess to run the command
-#             try:
-#                 # Run the command, wait for it to finish & clean up the temporary file
-#                 subprocess.run(command, check=True)
-#             except subprocess.CalledProcessError as e:
-#                 os.remove(temp_file_path)
-#                 print(f"Error occurred while running the DDP training: {e}")
-#                 raise
-#             finally:
-#                 os.remove(temp_file_path)
-#
-#         launch_ddp(torch.cuda.device_count(), temp_file_path)

From f52bdd50a4244266522d7bbb07be1cc7b6a9d192 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 9 Dec 2024 12:13:20 +0200
Subject: [PATCH 37/42] Added a multi GPU test flag

---
 .github/workflows/test_linux_multigpu.yml  |  80 ++++++++
 src/scvi/dataloaders/_ann_dataloader.py    |   1 -
 src/scvi/dataloaders/_concat_dataloader.py |   5 -
 src/scvi/dataloaders/_samplers.py          |   8 -
 tests/conftest.py                          |  23 ++-
 tests/dataloaders/test_dataloaders.py      |  88 ++++----
 tests/model/test_multigpu.py               | 228 +++++++++++----------
 7 files changed, 261 insertions(+), 172 deletions(-)
 create mode 100644 .github/workflows/test_linux_multigpu.yml

diff --git a/.github/workflows/test_linux_multigpu.yml b/.github/workflows/test_linux_multigpu.yml
new file mode 100644
index 0000000000..103b49180f
--- /dev/null
+++ b/.github/workflows/test_linux_multigpu.yml
@@ -0,0 +1,80 @@
+name: test (multi-GPU)
+
+on:
+  pull_request:
+    branches: [main, "[0-9]+.[0-9]+.x"]
+    types: [labeled, synchronize, opened]
+  schedule:
+    - cron: "0 10 * * *" # runs at 10:00 UTC (03:00 PST) every day
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    # if PR has label "multiGPU tests" or "all tests" or if scheduled or manually triggered or on push
+    if: >-
+      (
+        contains(github.event.pull_request.labels.*.name, 'multiGPU tests') ||
+        contains(github.event.pull_request.labels.*.name, 'all tests') ||
+        contains(github.event_name, 'schedule') ||
+        contains(github.event_name, 'workflow_dispatch') ||
+        contains(github.event_name, 'push')
+      )
+
+    runs-on: [self-hosted, Linux, X64, CUDA]
+
+    defaults:
+      run:
+        shell: bash -e {0} # -e to fail on error
+
+    container:
+      image: ghcr.io/scverse/scvi-tools:py3.12-cu12-base
+      options: --user root --gpus all --pull always
+
+    #    strategy:
+    #      fail-fast: false
+    #      matrix:
+    #        os: [ubuntu-latest]
+    #        python: ["3.12"]
+
+    permissions:
+      id-token: write
+
+    name: unit
+
+    env:
+      OS: ${{ matrix.os }}
+      PYTHON: ${{ matrix.python }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+          cache: "pip"
+          cache-dependency-path: "**/pyproject.toml"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip wheel uv
+          python -m uv pip install --system "scvi-tools[tests] @ ."
+          python -m pip install jax[cuda]
+          python -m pip install nvidia-nccl-cu12
+
+      - name: Run pytest
+        env:
+          MPLBACKEND: agg
+          PLATFORM: ${{ matrix.os }}
+          DISPLAY: :42
+          COLUMNS: 120
+        run: |
+          coverage run -m pytest -v --color=yes --multigpu-tests --accelerator cuda --devices auto
+          coverage report
+
+      - uses: codecov/codecov-action@v4
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py
index 7860184786..27e17302d5 100644
--- a/src/scvi/dataloaders/_ann_dataloader.py
+++ b/src/scvi/dataloaders/_ann_dataloader.py
@@ -124,7 +124,6 @@ def __init__(
                     drop_last=drop_last,
                     drop_dataset_tail=drop_dataset_tail,
                     shuffle=shuffle,
-                    # **kwargs,
                 )
             # do not touch batch size here, sampler gives batched indices
             # This disables PyTorch automatic batching, which is necessary
diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py
index 1d041758db..fdcdea4aa8 100644
--- a/src/scvi/dataloaders/_concat_dataloader.py
+++ b/src/scvi/dataloaders/_concat_dataloader.py
@@ -54,11 +54,6 @@ def __init__(
         self._batch_size = batch_size
         self._drop_last = drop_last
         self._distributed_sampler = distributed_sampler
-        # self._drop_dataset_tail = (
-        #     self.dataloader_kwargs["drop_dataset_tail"]
-        #     if "drop_dataset_tail" in self.dataloader_kwargs.keys()
-        #     else False
-        # )
 
         self.dataloaders = []
         for indices in indices_list:
diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py
index 667f27d699..283b2586e2 100644
--- a/src/scvi/dataloaders/_samplers.py
+++ b/src/scvi/dataloaders/_samplers.py
@@ -36,14 +36,6 @@ def __init__(
         drop_dataset_tail: bool = False,
         **kwargs,
     ):
-        # for redundant_key in [
-        #     "pin_memory",
-        #     "num_workers",
-        #     "persistent_workers",
-        # ]:
-        #     if redundant_key in kwargs:
-        #         kwargs.pop(redundant_key)
-
         super().__init__(dataset, drop_last=drop_dataset_tail, **kwargs)
         self.batch_size = batch_size
         self.drop_last_batch = drop_last  # drop_last already defined in parent
diff --git a/tests/conftest.py b/tests/conftest.py
index aac511cefc..3a6e942c7a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,6 @@
 import shutil
 
 import pytest
-from distutils.dir_util import copy_tree
 
 import scvi
 from tests.data.utils import generic_setup_adata_manager
@@ -15,6 +14,12 @@ def pytest_addoption(parser):
         default=False,
         help="Run tests that retrieve stuff from the internet. This increases test time.",
     )
+    parser.addoption(
+        "--multigpu-tests",
+        action="store_true",
+        default=False,
+        help="Run tests that are designed for multiGPU.",
+    )
     parser.addoption(
         "--optional",
         action="store_true",
@@ -62,7 +67,7 @@ def pytest_collection_modifyitems(config, items):
         # `--internet-tests` passed
         if not run_internet and ("internet" in item.keywords):
             item.add_marker(skip_internet)
-        # Skip all tests not marked with `pytest.mark.internet` if `--internet` passed
+        # Skip all tests not marked with `pytest.mark.internet` if `--internet-tests` passed
         elif run_internet and ("internet" not in item.keywords):
             item.add_marker(skip_non_internet)
 
@@ -90,13 +95,25 @@ def pytest_collection_modifyitems(config, items):
         elif run_private and ("private" not in item.keywords):
             item.add_marker(skip_non_private)
 
+    run_multigpu = config.getoption("--multigpu-tests")
+    skip_multigpu = pytest.mark.skip(reason="need --multigpu-tests option to run")
+    skip_non_multigpu = pytest.mark.skip(reason="test not having a pytest.mark.multigpu decorator")
+    for item in items:
+        # All tests marked with `pytest.mark.multigpu` get skipped unless
+        # `--multigpu-tests` passed
+        if not run_multigpu and ("multigpu" in item.keywords):
+            item.add_marker(skip_multigpu)
+        # Skip all tests not marked with `pytest.mark.multigpu` if `--multigpu-tests` passed
+        elif run_multigpu and ("multigpu" not in item.keywords):
+            item.add_marker(skip_non_multigpu)
+
 
 @pytest.fixture(scope="session")
 def save_path(tmp_path_factory):
     """Docstring for save_path."""
     dir = tmp_path_factory.mktemp("temp_data", numbered=False)
     path = str(dir)
-    copy_tree("tests/test_data", path)
+    shutil.copy_tree("tests/test_data", path)
     yield path + "/"
     shutil.rmtree(str(tmp_path_factory.getbasetemp()))
 
diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py
index c9fd30f29c..c1e96c6786 100644
--- a/tests/dataloaders/test_dataloaders.py
+++ b/tests/dataloaders/test_dataloaders.py
@@ -115,54 +115,54 @@ def multiprocessing_worker(
     return
 
 
+@pytest.mark.multigpu
 @pytest.mark.parametrize("num_processes", [1, 2])
 def test_anndataloader_distributed_sampler(num_processes: int, save_path: str):
-    if torch.cuda.is_available():
-        adata = scvi.data.synthetic_iid()
-        manager = generic_setup_adata_manager(adata)
-
-        file_path = save_path + "/dist_file"
-        if os.path.exists(file_path):  # Check if the file exists
-            os.remove(file_path)
-
-        torch.multiprocessing.spawn(
-            multiprocessing_worker,
-            args=(num_processes, manager, save_path, {}),
-            nprocs=num_processes,
-            join=True,
-        )
+    adata = scvi.data.synthetic_iid()
+    manager = generic_setup_adata_manager(adata)
 
+    file_path = save_path + "/dist_file"
+    if os.path.exists(file_path):  # Check if the file exists
+        os.remove(file_path)
+
+    torch.multiprocessing.spawn(
+        multiprocessing_worker,
+        args=(num_processes, manager, save_path, {}),
+        nprocs=num_processes,
+        join=True,
+    )
 
-@pytest.mark.parametrize("num_processes", [1])
+
+@pytest.mark.multigpu
+@pytest.mark.parametrize("num_processes", [1, 2])
 def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str):
-    if torch.cuda.is_available():
-        adata = scvi.data.synthetic_iid()
-        SCANVI.setup_anndata(
-            adata,
-            "labels",
-            "label_0",
-            batch_key="batch",
-        )
-        file_path = save_path + "/dist_file"
-        if os.path.exists(file_path):  # Check if the file exists
-            os.remove(file_path)
-        datasplitter_kwargs = {}
-        # Multi-GPU settings
-        datasplitter_kwargs["distributed_sampler"] = True
-        datasplitter_kwargs["drop_last"] = False
-        if num_processes == 1:
-            datasplitter_kwargs["distributed_sampler"] = False
-        model = SCANVI(adata, n_latent=10)
-
-        # initializes the distributed backend that takes care of synchronizing processes
-        torch.distributed.init_process_group(
-            "nccl",  # backend that works on all systems
-            init_method=f"file://{save_path}/dist_file",
-            rank=0,
-            world_size=num_processes,
-            store=None,
-        )
+    adata = scvi.data.synthetic_iid()
+    SCANVI.setup_anndata(
+        adata,
+        "labels",
+        "label_0",
+        batch_key="batch",
+    )
+    file_path = save_path + "/dist_file"
+    if os.path.exists(file_path):  # Check if the file exists
+        os.remove(file_path)
+    datasplitter_kwargs = {}
+    # Multi-GPU settings
+    datasplitter_kwargs["distributed_sampler"] = True
+    datasplitter_kwargs["drop_last"] = False
+    if num_processes == 1:
+        datasplitter_kwargs["distributed_sampler"] = False
+    model = SCANVI(adata, n_latent=10)
+
+    # initializes the distributed backend that takes care of synchronizing processes
+    torch.distributed.init_process_group(
+        "nccl",  # NCCL backend: requires CUDA GPUs
+        init_method=f"file://{save_path}/dist_file",
+        rank=0,
+        world_size=num_processes,
+        store=None,
+    )
 
-        model.train(1, datasplitter_kwargs=datasplitter_kwargs)
+    model.train(1, datasplitter_kwargs=datasplitter_kwargs)
 
-        torch.distributed.destroy_process_group()
+    torch.distributed.destroy_process_group()
diff --git a/tests/model/test_multigpu.py b/tests/model/test_multigpu.py
index 929caa9dc8..301502a27e 100644
--- a/tests/model/test_multigpu.py
+++ b/tests/model/test_multigpu.py
@@ -1,111 +1,117 @@
-# def test_scvi_train_ddp(save_path: str):
-#     training_code = """
-# import torch
-# import scvi
-# from scvi.model import SCVI
-#
-# adata = scvi.data.synthetic_iid()
-# SCVI.setup_anndata(adata)
-#
-# model = SCVI(adata)
-#
-# model.train(
-#     max_epochs=1,
-#     check_val_every_n_epoch=1,
-#     accelerator="gpu",
-#     devices=-1,
-#     strategy="ddp_find_unused_parameters_true",
-# )
-#
-# torch.distributed.destroy_process_group()
-#
-# assert model.is_trained
-# """
-#     import subprocess
-#     if torch.cuda.is_available():
-#         # Define the file path for the temporary script in the current working directory
-#         temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
-#
-#         # Write the training code to the file in the current working directory
-#         with open(temp_file_path, "w") as temp_file:
-#             temp_file.write(training_code)
-#             print(f"Temporary Python file created at: {temp_file_path}")
-#
-#         def launch_ddp(world_size, temp_file_path):
-#             # Command to run the script via torchrun
-#             command = [
-#                 "torchrun",
-#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-#                 temp_file_path,  # Your original script
-#             ]
-#             # Use subprocess to run the command
-#             try:
-#                 # Run the command, wait for it to finish & clean up the temporary file
-#                 subprocess.run(command, check=True)
-#             except subprocess.CalledProcessError as e:
-#                 os.remove(temp_file_path)
-#                 print(f"Error occurred while running the DDP training: {e}")
-#                 raise
-#             finally:
-#                 os.remove(temp_file_path)
-#
-#         launch_ddp(torch.cuda.device_count(), temp_file_path)
-#
-# def test_scanvi_train_ddp(save_path: str):
-#     training_code = """
-# import torch
-# import scvi
-# from scvi.model import SCANVI
-#
-# adata = scvi.data.synthetic_iid()
-# SCANVI.setup_anndata(
-#     adata,
-#     "labels",
-#     "label_0",
-#     batch_key="batch",
-# )
-#
-# model = SCANVI(adata, n_latent=10)
-#
-# model.train(
-#     max_epochs=100,
-#     train_size=0.5,
-#     check_val_every_n_epoch=1,
-#     accelerator="gpu",
-#     devices=-1,
-#     strategy="ddp_find_unused_parameters_true",
-# )
-#
-# torch.distributed.destroy_process_group()
-#
-# assert model.is_trained
-# """
-#     import subprocess
-#     if torch.cuda.is_available():
-#         # Define the file path for the temporary script in the current working directory
-#         temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py")
-#
-#         # Write the training code to the file in the current working directory
-#         with open(temp_file_path, "w") as temp_file:
-#             temp_file.write(training_code)
-#             print(f"Temporary Python file created at: {temp_file_path}")
-#
-#         def launch_ddp(world_size, temp_file_path):
-#             # Command to run the script via torchrun
-#             command = [
-#                 "torchrun",
-#                 "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-#                 temp_file_path,  # Your original script
-#             ]
-#             # Use subprocess to run the command
-#             try:
-#                 # Run the command, wait for it to finish & clean up the temporary file
-#                 subprocess.run(command, check=True)
-#             except subprocess.CalledProcessError as e:
-#                 os.remove(temp_file_path)
-#                 print(f"Error occurred while running the DDP training: {e}")
-#                 raise
-#             finally:
-#                 os.remove(temp_file_path)
-#
-#         launch_ddp(torch.cuda.device_count(), temp_file_path)
+import os
+import subprocess
+
+import pytest
+import torch
+
+
+@pytest.mark.multigpu
+def test_scvi_train_ddp(save_path: str):
+    """Train SCVI under DDP by running a generated script through ``torchrun``."""
+    training_code = """
+import torch
+import scvi
+from scvi.model import SCVI
+
+adata = scvi.data.synthetic_iid()
+SCVI.setup_anndata(adata)
+
+model = SCVI(adata)
+
+model.train(
+    max_epochs=1,
+    check_val_every_n_epoch=1,
+    accelerator="gpu",
+    devices=-1,
+    strategy="ddp_find_unused_parameters_true",
+)
+
+torch.distributed.destroy_process_group()
+
+assert model.is_trained
+"""
+    # The temporary script is created inside the ``save_path`` fixture directory
+    temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
+
+    # Write the training code to that file
+    with open(temp_file_path, "w") as temp_file:
+        temp_file.write(training_code)
+        print(f"Temporary Python file created at: {temp_file_path}")
+
+    def launch_ddp(world_size, temp_file_path):
+        # Command to run the script via torchrun
+        command = [
+            "torchrun",
+            "--nproc_per_node=" + str(world_size),  # one process per visible GPU
+            temp_file_path,  # the generated training script
+        ]
+        try:
+            # check=True raises CalledProcessError on a non-zero exit status
+            subprocess.run(command, check=True)
+        except subprocess.CalledProcessError as e:
+            # Cleanup is left to ``finally``; removing here too made it fail there
+            print(f"Error occurred while running the DDP training: {e}")
+            raise
+        finally:
+            os.remove(temp_file_path)
+
+    launch_ddp(torch.cuda.device_count(), temp_file_path)
+
+
+@pytest.mark.multigpu
+def test_scanvi_train_ddp(save_path: str):
+    """Train SCANVI under DDP by running a generated script through ``torchrun``."""
+    training_code = """
+import torch
+import scvi
+from scvi.model import SCANVI
+
+adata = scvi.data.synthetic_iid()
+SCANVI.setup_anndata(
+    adata,
+    "labels",
+    "label_0",
+    batch_key="batch",
+)
+
+model = SCANVI(adata, n_latent=10)
+
+model.train(
+    max_epochs=100,
+    train_size=0.5,
+    check_val_every_n_epoch=1,
+    accelerator="gpu",
+    devices=-1,
+    strategy="ddp_find_unused_parameters_true",
+)
+
+torch.distributed.destroy_process_group()
+
+assert model.is_trained
+"""
+    # The temporary script is created inside the ``save_path`` fixture directory
+    temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py")
+
+    # Write the training code to that file
+    with open(temp_file_path, "w") as temp_file:
+        temp_file.write(training_code)
+        print(f"Temporary Python file created at: {temp_file_path}")
+
+    def launch_ddp(world_size, temp_file_path):
+        # Command to run the script via torchrun
+        command = [
+            "torchrun",
+            "--nproc_per_node=" + str(world_size),  # one process per visible GPU
+            temp_file_path,  # the generated training script
+        ]
+        try:
+            # check=True raises CalledProcessError on a non-zero exit status
+            subprocess.run(command, check=True)
+        except subprocess.CalledProcessError as e:
+            # Cleanup is left to ``finally``; removing here too made it fail there
+            print(f"Error occurred while running the DDP training: {e}")
+            raise
+        finally:
+            os.remove(temp_file_path)
+
+    launch_ddp(torch.cuda.device_count(), temp_file_path)

From 2641284d2345ae9c47e5e2a1563a47c6f22b0ec7 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 9 Dec 2024 12:22:34 +0200
Subject: [PATCH 38/42] Added changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index de569f0b19..66df2a0a58 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,8 +27,12 @@ to [Semantic Versioning]. Full commit history is available in the
 
 #### Fixed
 
+- Fixed bug in distributed `scvi.dataloaders._concat_dataloader` {pr}`3053`.
+
 #### Changed
 
+- Updated the CI workflow with internet, private, optional and multiGPU tests {pr}`3082`.
+
 #### Removed
 
 ### 1.2.1 (2024-12-04)

From 667895080120c33abf120a9c4a8a024a222a83af Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 9 Dec 2024 12:28:54 +0200
Subject: [PATCH 39/42] added changelog

---
 CHANGELOG.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 66df2a0a58..f8a74e4dcf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,14 +10,18 @@ to [Semantic Versioning]. Full commit history is available in the
 
 #### Added
 
+- Add {class}`scvi.external.Decipher` for dimensionality reduction and interpretable
+    representation learning in single-cell RNA sequencing data {pr}`3015`.
+
 #### Fixed
 
+- Fixed bug in distributed `scvi.dataloaders._concat_dataloader` {pr}`3053`.
+
 #### Changed
 
-#### Removed
+- Updated the CI workflow with multiGPU tests {pr}`3053`.
 
-- Add {class}`scvi.external.Decipher` for dimensionality reduction and interpretable
-    representation learning in single-cell RNA sequencing data {pr}`3015`.
+#### Removed
 
 ### 1.2.2 (2024-XX-XX)
 
@@ -27,11 +31,9 @@ to [Semantic Versioning]. Full commit history is available in the
 
 #### Fixed
 
-- Fixed bug in distributed `scvi.dataloaders._concat_dataloader` {pr}`3053`.
-
 #### Changed
 
-- Updated the CI workflow with internet, private, optional and multiGPU tests {pr}`3082`.
+- Updated the CI workflow with internet, private and optional tests {pr}`3082`.
 
 #### Removed
 

From 12f4767edb826dfd6dd8ff9e31c5b7f88728250e Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 9 Dec 2024 12:36:25 +0200
Subject: [PATCH 40/42] fix in multigpu tests

---
 .github/workflows/test_linux_multigpu.yml | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/test_linux_multigpu.yml b/.github/workflows/test_linux_multigpu.yml
index 103b49180f..1b3fa109aa 100644
--- a/.github/workflows/test_linux_multigpu.yml
+++ b/.github/workflows/test_linux_multigpu.yml
@@ -1,6 +1,8 @@
 name: test (multi-GPU)
 
 on:
+  push:
+    branches: [main, "[0-9]+.[0-9]+.x"] # NOTE(review): the job-level `if` omits 'push', so push runs are skipped; confirm
   pull_request:
     branches: [main, "[0-9]+.[0-9]+.x"]
     types: [labeled, synchronize, opened]
@@ -20,8 +22,7 @@ jobs:
         contains(github.event.pull_request.labels.*.name, 'multiGPU tests') ||
         contains(github.event.pull_request.labels.*.name, 'all tests') ||
         contains(github.event_name, 'schedule') ||
-        contains(github.event_name, 'workflow_dispatch') ||
-        contains(github.event_name, 'push')
+        contains(github.event_name, 'workflow_dispatch')
       )
 
     runs-on: [self-hosted, Linux, X64, CUDA]
@@ -34,16 +35,7 @@ jobs:
       image: ghcr.io/scverse/scvi-tools:py3.12-cu12-base
       options: --user root --gpus all --pull always
 
-    #    strategy:
-    #      fail-fast: false
-    #      matrix:
-    #        os: [ubuntu-latest]
-    #        python: ["3.12"]
-
-    permissions:
-      id-token: write
-
-    name: unit
+    name: integration
 
     env:
       OS: ${{ matrix.os }}

From 8ade2529607348c37c329badac8ba710f41f3d62 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 9 Dec 2024 12:46:20 +0200
Subject: [PATCH 41/42] fix in conftest

---
 tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 3a6e942c7a..329c88439c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -113,7 +113,7 @@ def save_path(tmp_path_factory):
     """Docstring for save_path."""
     dir = tmp_path_factory.mktemp("temp_data", numbered=False)
     path = str(dir)
-    shutil.copy_tree("tests/test_data", path)
+    shutil.copytree("tests/test_data", path)
     yield path + "/"
     shutil.rmtree(str(tmp_path_factory.getbasetemp()))
 

From 2f780bb70dcd1348c2eea5cf824241410e8c316a Mon Sep 17 00:00:00 2001
From: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Date: Mon, 9 Dec 2024 12:49:56 +0200
Subject: [PATCH 42/42] revert thing with shutil

---
 tests/conftest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 329c88439c..6ef9467efc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,7 @@
 import shutil
 
 import pytest
+from distutils.dir_util import copy_tree
 
 import scvi
 from tests.data.utils import generic_setup_adata_manager
@@ -113,7 +114,7 @@ def save_path(tmp_path_factory):
     """Docstring for save_path."""
     dir = tmp_path_factory.mktemp("temp_data", numbered=False)
     path = str(dir)
-    shutil.copytree("tests/test_data", path)
+    copy_tree("tests/test_data", path)
     yield path + "/"
     shutil.rmtree(str(tmp_path_factory.getbasetemp()))