From 8932953c5bbb634a98294a1aeec769936bb6591a Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 25 Nov 2024 18:15:56 +0200 Subject: [PATCH 01/42] Update for the dataloder in case of distributed sampler --- src/scvi/dataloaders/_ann_dataloader.py | 8 ++++ src/scvi/dataloaders/_concat_dataloader.py | 12 ++++++ src/scvi/dataloaders/_samplers.py | 16 +++++++- tests/dataloaders/test_dataloaders.py | 46 ++++++++++++++++++++-- 4 files changed, 77 insertions(+), 5 deletions(-) diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py index 27e17302d5..b978239569 100644 --- a/src/scvi/dataloaders/_ann_dataloader.py +++ b/src/scvi/dataloaders/_ann_dataloader.py @@ -118,12 +118,17 @@ def __init__( drop_last=drop_last, ) else: + if "save_path" not in kwargs: + kwargs["save_path"] = "/." + if "num_processes" not in kwargs: + kwargs["num_processes"] = 1 sampler = BatchDistributedSampler( self.dataset, batch_size=batch_size, drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, shuffle=shuffle, + **kwargs ) # do not touch batch size here, sampler gives batched indices # This disables PyTorch automatic batching, which is necessary @@ -135,4 +140,7 @@ def __init__( if iter_ndarray: self.kwargs.update({"collate_fn": lambda x: x}) + for redundant_key in ["save_path","num_processes"]: + if redundant_key in self.kwargs: + self.kwargs.pop(redundant_key) super().__init__(self.dataset, **self.kwargs) diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py index 9aa9071a85..66802128fe 100644 --- a/src/scvi/dataloaders/_concat_dataloader.py +++ b/src/scvi/dataloaders/_concat_dataloader.py @@ -25,6 +25,13 @@ class ConcatDataLoader(DataLoader): Dictionary with keys representing keys in data registry (``adata_manager.data_registry``) and value equal to desired numpy loading type (later made into torch tensor). If ``None``, defaults to all registered data. 
+ drop_last + If `True` and the dataset is not evenly divisible by `batch_size`, the last + incomplete batch is dropped. If `False` and the dataset is not evenly divisible + by `batch_size`, then the last batch will be smaller than `batch_size`. + distributed_sampler + ``EXPERIMENTAL`` Whether to use :class:`~scvi.dataloaders.BatchDistributedSampler` as the + sampler. If `True`, `sampler` must be `None`. data_loader_kwargs Keyword arguments for :class:`~torch.utils.data.DataLoader` """ @@ -37,6 +44,7 @@ def __init__( batch_size: int = 128, data_and_attributes: dict | None = None, drop_last: bool | int = False, + distributed_sampler: bool = False, **data_loader_kwargs, ): self.adata_manager = adata_manager @@ -56,11 +64,15 @@ def __init__( batch_size=batch_size, data_and_attributes=data_and_attributes, drop_last=drop_last, + distributed_sampler=distributed_sampler, **self.dataloader_kwargs, ) ) lens = [len(dl) for dl in self.dataloaders] self.largest_dl = self.dataloaders[np.argmax(lens)] + for redundant_key in ["save_path","num_processes"]: + if redundant_key in data_loader_kwargs: + data_loader_kwargs.pop(redundant_key) super().__init__(self.largest_dl, **data_loader_kwargs) def __len__(self): diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index 283b2586e2..7137bb7406 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -1,5 +1,5 @@ from torch.utils.data import Dataset, DistributedSampler - +import torch class BatchDistributedSampler(DistributedSampler): """``EXPERIMENTAL`` Sampler that restricts to loading from a subset of the dataset. 
@@ -36,6 +36,20 @@ def __init__( drop_dataset_tail: bool = False, **kwargs, ): + + if not torch.distributed.is_initialized(): + # initializes the distributed backend that takes care of synchronizing processes + torch.distributed.init_process_group( + "gloo", # backend that works on all systems + init_method="file://"+kwargs["save_path"]+"/dist_file", + rank=0, + world_size=kwargs["num_processes"], + ) + + for redundant_key in ["save_path","pin_memory","num_processes","num_workers","persistent_workers"]: + if redundant_key in kwargs: + kwargs.pop(redundant_key) + super().__init__(dataset, drop_last=drop_dataset_tail, **kwargs) self.batch_size = batch_size self.drop_last_batch = drop_last # drop_last already defined in parent diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index d03d9ca9f1..c3fc1573f5 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -2,10 +2,10 @@ import pytest import torch from tests.data.utils import generic_setup_adata_manager - +import os import scvi from scvi import REGISTRY_KEYS - +from scvi.model import SCANVI class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan): def __init__(self, *args, **kwargs): @@ -107,14 +107,52 @@ def multiprocessing_worker( return -@pytest.mark.optional -def test_anndataloader_distributed_sampler(save_path: str, num_processes: int = 2): +#@pytest.mark.optional +@pytest.mark.parametrize("num_processes", [1, 2]) +def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): adata = scvi.data.synthetic_iid() manager = generic_setup_adata_manager(adata) + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + torch.multiprocessing.spawn( multiprocessing_worker, args=(num_processes, manager, save_path), nprocs=num_processes, join=True, ) + +#@pytest.mark.optional +@pytest.mark.parametrize("num_processes", [1, 2]) +def 
test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): + if torch.cuda.is_available(): + adata = scvi.data.synthetic_iid() + manager = generic_setup_adata_manager(adata) + SCANVI.setup_anndata( + adata, + "labels", + "label_0", + batch_key="batch", + ) + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + datasplitter_kwargs = {} + datasplitter_kwargs['distributed_sampler'] = True + if num_processes==1: + datasplitter_kwargs['distributed_sampler'] = False + datasplitter_kwargs['save_path'] = save_path + datasplitter_kwargs['num_processes'] = num_processes + model = SCANVI(adata, n_latent=10) + + torch.multiprocessing.spawn( + multiprocessing_worker, + args=(num_processes, manager, save_path), + nprocs=num_processes, + join=True, + ) + + model.train(1, datasplitter_kwargs=datasplitter_kwargs) + From df60e2796f7046c372d6c6864bb0850f0cc005b0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:18:08 +0000 Subject: [PATCH 02/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scvi/dataloaders/_ann_dataloader.py | 4 ++-- src/scvi/dataloaders/_concat_dataloader.py | 2 +- src/scvi/dataloaders/_samplers.py | 14 ++++++++++---- tests/dataloaders/test_dataloaders.py | 21 ++++++++++++--------- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py index b978239569..2b63886ded 100644 --- a/src/scvi/dataloaders/_ann_dataloader.py +++ b/src/scvi/dataloaders/_ann_dataloader.py @@ -128,7 +128,7 @@ def __init__( drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, shuffle=shuffle, - **kwargs + **kwargs, ) # do not touch batch size here, sampler gives batched indices # This disables PyTorch automatic batching, which is necessary @@ -140,7 +140,7 @@ def __init__( if 
iter_ndarray: self.kwargs.update({"collate_fn": lambda x: x}) - for redundant_key in ["save_path","num_processes"]: + for redundant_key in ["save_path", "num_processes"]: if redundant_key in self.kwargs: self.kwargs.pop(redundant_key) super().__init__(self.dataset, **self.kwargs) diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py index 66802128fe..5b368aee23 100644 --- a/src/scvi/dataloaders/_concat_dataloader.py +++ b/src/scvi/dataloaders/_concat_dataloader.py @@ -70,7 +70,7 @@ def __init__( ) lens = [len(dl) for dl in self.dataloaders] self.largest_dl = self.dataloaders[np.argmax(lens)] - for redundant_key in ["save_path","num_processes"]: + for redundant_key in ["save_path", "num_processes"]: if redundant_key in data_loader_kwargs: data_loader_kwargs.pop(redundant_key) super().__init__(self.largest_dl, **data_loader_kwargs) diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index 7137bb7406..bd407d60f2 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -1,5 +1,6 @@ -from torch.utils.data import Dataset, DistributedSampler import torch +from torch.utils.data import Dataset, DistributedSampler + class BatchDistributedSampler(DistributedSampler): """``EXPERIMENTAL`` Sampler that restricts to loading from a subset of the dataset. 
@@ -36,17 +37,22 @@ def __init__( drop_dataset_tail: bool = False, **kwargs, ): - if not torch.distributed.is_initialized(): # initializes the distributed backend that takes care of synchronizing processes torch.distributed.init_process_group( "gloo", # backend that works on all systems - init_method="file://"+kwargs["save_path"]+"/dist_file", + init_method="file://" + kwargs["save_path"] + "/dist_file", rank=0, world_size=kwargs["num_processes"], ) - for redundant_key in ["save_path","pin_memory","num_processes","num_workers","persistent_workers"]: + for redundant_key in [ + "save_path", + "pin_memory", + "num_processes", + "num_workers", + "persistent_workers", + ]: if redundant_key in kwargs: kwargs.pop(redundant_key) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index c3fc1573f5..2b825279f8 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -1,12 +1,15 @@ +import os + import numpy as np import pytest import torch from tests.data.utils import generic_setup_adata_manager -import os + import scvi from scvi import REGISTRY_KEYS from scvi.model import SCANVI + class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan): def __init__(self, *args, **kwargs): self.n_samples_per_label = kwargs.pop("n_samples_per_label") @@ -107,7 +110,7 @@ def multiprocessing_worker( return -#@pytest.mark.optional +# @pytest.mark.optional @pytest.mark.parametrize("num_processes", [1, 2]) def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): adata = scvi.data.synthetic_iid() @@ -124,7 +127,8 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): join=True, ) -#@pytest.mark.optional + +# @pytest.mark.optional @pytest.mark.parametrize("num_processes", [1, 2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): @@ -140,11 +144,11 @@ def 
test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if os.path.exists(file_path): # Check if the file exists os.remove(file_path) datasplitter_kwargs = {} - datasplitter_kwargs['distributed_sampler'] = True - if num_processes==1: - datasplitter_kwargs['distributed_sampler'] = False - datasplitter_kwargs['save_path'] = save_path - datasplitter_kwargs['num_processes'] = num_processes + datasplitter_kwargs["distributed_sampler"] = True + if num_processes == 1: + datasplitter_kwargs["distributed_sampler"] = False + datasplitter_kwargs["save_path"] = save_path + datasplitter_kwargs["num_processes"] = num_processes model = SCANVI(adata, n_latent=10) torch.multiprocessing.spawn( @@ -155,4 +159,3 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): ) model.train(1, datasplitter_kwargs=datasplitter_kwargs) - From 353d4449dd784c91981d2e3dda8d8f43125ec32b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 09:08:56 +0000 Subject: [PATCH 03/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 228cad01e2..058dd5d37b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,7 +30,7 @@ to [Semantic Versioning]. Full commit history is available in the - Implemented variance of ZINB distribution. {pr}`3044`. - Add {class}`scvi.external.METHYLVI` for modeling methylation data from single-cell bisulfite sequencing (scBS-seq) experiments {pr}`2834`. 
- + #### Fixed - Breaking Change: Fix `get_outlier_cell_sample_pairs` function in {class}`scvi.external.MRVI` From 1e17d7f0ae892455ff74c6769ecb393376ccb65f Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Thu, 28 Nov 2024 12:56:49 +0200 Subject: [PATCH 04/42] fix test_samplers --- tests/dataloaders/test_dataloaders.py | 54 +++++++++++++-------------- tests/dataloaders/test_samplers.py | 28 ++++++++++++++ 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 2b825279f8..3a06ac5af5 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -131,31 +131,31 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): # @pytest.mark.optional @pytest.mark.parametrize("num_processes", [1, 2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): - if torch.cuda.is_available(): - adata = scvi.data.synthetic_iid() - manager = generic_setup_adata_manager(adata) - SCANVI.setup_anndata( - adata, - "labels", - "label_0", - batch_key="batch", - ) - file_path = save_path + "/dist_file" - if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) - datasplitter_kwargs = {} - datasplitter_kwargs["distributed_sampler"] = True - if num_processes == 1: - datasplitter_kwargs["distributed_sampler"] = False - datasplitter_kwargs["save_path"] = save_path - datasplitter_kwargs["num_processes"] = num_processes - model = SCANVI(adata, n_latent=10) - - torch.multiprocessing.spawn( - multiprocessing_worker, - args=(num_processes, manager, save_path), - nprocs=num_processes, - join=True, - ) + #if torch.cuda.is_available(): + adata = scvi.data.synthetic_iid() + manager = generic_setup_adata_manager(adata) + SCANVI.setup_anndata( + adata, + "labels", + "label_0", + batch_key="batch", + ) + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + 
os.remove(file_path) + datasplitter_kwargs = {} + datasplitter_kwargs["distributed_sampler"] = True + if num_processes == 1: + datasplitter_kwargs["distributed_sampler"] = False + datasplitter_kwargs["save_path"] = save_path + datasplitter_kwargs["num_processes"] = num_processes + model = SCANVI(adata, n_latent=10) + + torch.multiprocessing.spawn( + multiprocessing_worker, + args=(num_processes, manager, save_path), + nprocs=num_processes, + join=True, + ) - model.train(1, datasplitter_kwargs=datasplitter_kwargs) + model.train(1, datasplitter_kwargs=datasplitter_kwargs) diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py index ae4861d98f..58d04ed67f 100644 --- a/tests/dataloaders/test_samplers.py +++ b/tests/dataloaders/test_samplers.py @@ -1,6 +1,7 @@ from math import ceil, floor import numpy as np +import os import pytest from tests.data.utils import generic_setup_adata_manager @@ -8,7 +9,10 @@ from scvi.dataloaders import BatchDistributedSampler +@pytest.mark.parametrize("num_processes", [1, 2]) def test_batchdistributedsampler_init( + num_processes: int, + save_path: str, batch_size: int = 128, n_batches: int = 2, ): @@ -16,6 +20,10 @@ def test_batchdistributedsampler_init( manager = generic_setup_adata_manager(adata) dataset = manager.create_torch_dataset() + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + sampler = BatchDistributedSampler( dataset, num_replicas=1, @@ -24,6 +32,8 @@ def test_batchdistributedsampler_init( shuffle=True, drop_last=True, drop_dataset_tail=True, + num_processes=num_processes, + save_path=save_path ) assert sampler.batch_size == batch_size assert sampler.rank == 0 @@ -35,9 +45,12 @@ def test_batchdistributedsampler_init( @pytest.mark.parametrize("drop_last", [True, False]) @pytest.mark.parametrize("drop_dataset_tail", [True, False]) +@pytest.mark.parametrize("num_processes", [1, 2]) def test_batchdistributedsampler_drop_last( + 
num_processes: int, drop_last: bool, drop_dataset_tail: bool, + save_path: str, batch_size: int = 128, n_batches: int = 3, num_replicas: int = 2, @@ -101,6 +114,10 @@ def check_samplers(samplers: list, sampler_batch_size: int): assert len(all_indices) == effective_n_obs_per_sampler assert [len(indices) for indices in batch_indices] == batch_sizes + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + for sampler_batch_size in [batch_size, batch_size - 1, batch_size + 1]: samplers = [ BatchDistributedSampler( @@ -110,13 +127,18 @@ def check_samplers(samplers: list, sampler_batch_size: int): batch_size=sampler_batch_size, drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, + num_processes=num_processes, + save_path=save_path ) for i in range(num_replicas) ] check_samplers(samplers, sampler_batch_size) +@pytest.mark.parametrize("num_processes", [1, 2]) def test_batchdistributedsampler_indices( + num_processes: int, + save_path: str, batch_size: int = 128, n_batches: int = 3, num_replicas: int = 2, @@ -125,12 +147,18 @@ def test_batchdistributedsampler_indices( manager = generic_setup_adata_manager(adata) dataset = manager.create_torch_dataset() + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + samplers = [ BatchDistributedSampler( dataset, num_replicas=num_replicas, rank=i, batch_size=batch_size, + num_processes=num_processes, + save_path=save_path ) for i in range(num_replicas) ] From d63337ab0852d0429fbd03d8b030cfd607063810 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:57:05 +0000 Subject: [PATCH 05/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/dataloaders/test_dataloaders.py | 2 +- tests/dataloaders/test_samplers.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 
deletions(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 3a06ac5af5..82e07b1b26 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -131,7 +131,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): # @pytest.mark.optional @pytest.mark.parametrize("num_processes", [1, 2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): - #if torch.cuda.is_available(): + # if torch.cuda.is_available(): adata = scvi.data.synthetic_iid() manager = generic_setup_adata_manager(adata) SCANVI.setup_anndata( diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py index 58d04ed67f..09a8a5a127 100644 --- a/tests/dataloaders/test_samplers.py +++ b/tests/dataloaders/test_samplers.py @@ -1,7 +1,7 @@ +import os from math import ceil, floor import numpy as np -import os import pytest from tests.data.utils import generic_setup_adata_manager @@ -33,7 +33,7 @@ def test_batchdistributedsampler_init( drop_last=True, drop_dataset_tail=True, num_processes=num_processes, - save_path=save_path + save_path=save_path, ) assert sampler.batch_size == batch_size assert sampler.rank == 0 @@ -128,7 +128,7 @@ def check_samplers(samplers: list, sampler_batch_size: int): drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, num_processes=num_processes, - save_path=save_path + save_path=save_path, ) for i in range(num_replicas) ] @@ -158,7 +158,7 @@ def test_batchdistributedsampler_indices( rank=i, batch_size=batch_size, num_processes=num_processes, - save_path=save_path + save_path=save_path, ) for i in range(num_replicas) ] From ca0daf0345b10f6630eefd33abb261e1b9d97f7e Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Thu, 28 Nov 2024 13:11:40 +0200 Subject: [PATCH 06/42] fix test_samplers --- tests/dataloaders/test_dataloaders.py | 54 +++++++++++++-------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git 
a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 3a06ac5af5..2b825279f8 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -131,31 +131,31 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): # @pytest.mark.optional @pytest.mark.parametrize("num_processes", [1, 2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): - #if torch.cuda.is_available(): - adata = scvi.data.synthetic_iid() - manager = generic_setup_adata_manager(adata) - SCANVI.setup_anndata( - adata, - "labels", - "label_0", - batch_key="batch", - ) - file_path = save_path + "/dist_file" - if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) - datasplitter_kwargs = {} - datasplitter_kwargs["distributed_sampler"] = True - if num_processes == 1: - datasplitter_kwargs["distributed_sampler"] = False - datasplitter_kwargs["save_path"] = save_path - datasplitter_kwargs["num_processes"] = num_processes - model = SCANVI(adata, n_latent=10) - - torch.multiprocessing.spawn( - multiprocessing_worker, - args=(num_processes, manager, save_path), - nprocs=num_processes, - join=True, - ) + if torch.cuda.is_available(): + adata = scvi.data.synthetic_iid() + manager = generic_setup_adata_manager(adata) + SCANVI.setup_anndata( + adata, + "labels", + "label_0", + batch_key="batch", + ) + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + datasplitter_kwargs = {} + datasplitter_kwargs["distributed_sampler"] = True + if num_processes == 1: + datasplitter_kwargs["distributed_sampler"] = False + datasplitter_kwargs["save_path"] = save_path + datasplitter_kwargs["num_processes"] = num_processes + model = SCANVI(adata, n_latent=10) + + torch.multiprocessing.spawn( + multiprocessing_worker, + args=(num_processes, manager, save_path), + nprocs=num_processes, + join=True, + ) - model.train(1, 
datasplitter_kwargs=datasplitter_kwargs) + model.train(1, datasplitter_kwargs=datasplitter_kwargs) From fe99eabd70a160f2013a28c8aa81ddb8978c54aa Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Thu, 28 Nov 2024 16:03:35 +0200 Subject: [PATCH 07/42] Changed to nvidia nccl and fix drop last batch parameter for evenly batches between gpus --- src/scvi/dataloaders/_ann_dataloader.py | 10 +++++----- src/scvi/dataloaders/_concat_dataloader.py | 6 ++++-- src/scvi/dataloaders/_samplers.py | 3 ++- src/scvi/nn/_base_components.py | 3 ++- tests/dataloaders/test_dataloaders.py | 21 +++++++++++++-------- tests/dataloaders/test_samplers.py | 8 ++++---- 6 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py index 2b63886ded..ee3336ce09 100644 --- a/src/scvi/dataloaders/_ann_dataloader.py +++ b/src/scvi/dataloaders/_ann_dataloader.py @@ -117,6 +117,10 @@ def __init__( batch_size=batch_size, drop_last=drop_last, ) + # do not touch batch size here, sampler gives batched indices + # This disables PyTorch automatic batching, which is necessary + # for fast access to sparse matrices + self.kwargs.update({"batch_size": None, "shuffle": False}) else: if "save_path" not in kwargs: kwargs["save_path"] = "/." 
@@ -130,10 +134,6 @@ def __init__( shuffle=shuffle, **kwargs, ) - # do not touch batch size here, sampler gives batched indices - # This disables PyTorch automatic batching, which is necessary - # for fast access to sparse matrices - self.kwargs.update({"batch_size": None, "shuffle": False}) self.kwargs.update({"sampler": sampler}) @@ -143,4 +143,4 @@ def __init__( for redundant_key in ["save_path", "num_processes"]: if redundant_key in self.kwargs: self.kwargs.pop(redundant_key) - super().__init__(self.dataset, **self.kwargs) + super().__init__(self.dataset, drop_last=drop_dataset_tail, **self.kwargs) diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py index 5b368aee23..a1c4616f56 100644 --- a/src/scvi/dataloaders/_concat_dataloader.py +++ b/src/scvi/dataloaders/_concat_dataloader.py @@ -53,6 +53,8 @@ def __init__( self._shuffle = shuffle self._batch_size = batch_size self._drop_last = drop_last + self._drop_dataset_tail = self.dataloader_kwargs["drop_dataset_tail"] \ + if "drop_dataset_tail" in self.dataloader_kwargs.keys() else False self.dataloaders = [] for indices in indices_list: @@ -70,10 +72,10 @@ def __init__( ) lens = [len(dl) for dl in self.dataloaders] self.largest_dl = self.dataloaders[np.argmax(lens)] - for redundant_key in ["save_path", "num_processes"]: + for redundant_key in ["save_path", "num_processes","drop_dataset_tail"]: if redundant_key in data_loader_kwargs: data_loader_kwargs.pop(redundant_key) - super().__init__(self.largest_dl, **data_loader_kwargs) + super().__init__(self.largest_dl, drop_last=self._drop_dataset_tail, **data_loader_kwargs) def __len__(self): return len(self.largest_dl) diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index bd407d60f2..3e62bc9b3c 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -40,10 +40,11 @@ def __init__( if not torch.distributed.is_initialized(): # initializes the distributed 
backend that takes care of synchronizing processes torch.distributed.init_process_group( - "gloo", # backend that works on all systems + "nccl", # backend that works on all systems init_method="file://" + kwargs["save_path"] + "/dist_file", rank=0, world_size=kwargs["num_processes"], + store=None ) for redundant_key in [ diff --git a/src/scvi/nn/_base_components.py b/src/scvi/nn/_base_components.py index 59b537a321..f632aff18d 100644 --- a/src/scvi/nn/_base_components.py +++ b/src/scvi/nn/_base_components.py @@ -175,7 +175,8 @@ def forward(self, x: torch.Tensor, *cat_list: int): if isinstance(layer, nn.Linear) and self.inject_into_layer(i): if x.dim() == 3: one_hot_cat_list_layer = [ - o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1))) + o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1))) if o.dim()==2 + else o[0].unsqueeze(0).expand((x.size(1), o.size(1), o.size(2))) for o in one_hot_cat_list ] else: diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 2b825279f8..b3b22abda2 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -95,17 +95,18 @@ def test_anndataloader_distributed_sampler_init(): def multiprocessing_worker( - rank: int, world_size: int, manager: scvi.data.AnnDataManager, save_path: str + rank: int, world_size: int, manager: scvi.data.AnnDataManager, save_path: str, datasplitter_kwargs ): # initializes the distributed backend that takes care of synchronizing processes torch.distributed.init_process_group( - "gloo", # backend that works on all systems + "nccl", # backend that works on all systems init_method=f"file://{save_path}/dist_file", rank=rank, world_size=world_size, + store = None ) - _ = scvi.dataloaders.AnnDataLoader(manager, distributed_sampler=True) + _ = scvi.dataloaders.AnnDataLoader(manager, **datasplitter_kwargs) return @@ -122,14 +123,14 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): 
torch.multiprocessing.spawn( multiprocessing_worker, - args=(num_processes, manager, save_path), + args=(num_processes, manager, save_path, {}), nprocs=num_processes, join=True, ) # @pytest.mark.optional -@pytest.mark.parametrize("num_processes", [1, 2]) +@pytest.mark.parametrize("num_processes", [2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): adata = scvi.data.synthetic_iid() @@ -144,16 +145,20 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if os.path.exists(file_path): # Check if the file exists os.remove(file_path) datasplitter_kwargs = {} + #Multi-GPU settings datasplitter_kwargs["distributed_sampler"] = True - if num_processes == 1: - datasplitter_kwargs["distributed_sampler"] = False datasplitter_kwargs["save_path"] = save_path datasplitter_kwargs["num_processes"] = num_processes + datasplitter_kwargs["drop_dataset_tail"] = True + datasplitter_kwargs["drop_last"] = False + if num_processes == 1: + datasplitter_kwargs["distributed_sampler"] = False + datasplitter_kwargs["drop_dataset_tail"] = False model = SCANVI(adata, n_latent=10) torch.multiprocessing.spawn( multiprocessing_worker, - args=(num_processes, manager, save_path), + args=(num_processes, manager, save_path, {}), nprocs=num_processes, join=True, ) diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py index 09a8a5a127..58d04ed67f 100644 --- a/tests/dataloaders/test_samplers.py +++ b/tests/dataloaders/test_samplers.py @@ -1,7 +1,7 @@ -import os from math import ceil, floor import numpy as np +import os import pytest from tests.data.utils import generic_setup_adata_manager @@ -33,7 +33,7 @@ def test_batchdistributedsampler_init( drop_last=True, drop_dataset_tail=True, num_processes=num_processes, - save_path=save_path, + save_path=save_path ) assert sampler.batch_size == batch_size assert sampler.rank == 0 @@ -128,7 +128,7 @@ def check_samplers(samplers: list, 
sampler_batch_size: int): drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, num_processes=num_processes, - save_path=save_path, + save_path=save_path ) for i in range(num_replicas) ] @@ -158,7 +158,7 @@ def test_batchdistributedsampler_indices( rank=i, batch_size=batch_size, num_processes=num_processes, - save_path=save_path, + save_path=save_path ) for i in range(num_replicas) ] From 1e2386bf46f3ba54f425fb2930748f3e91381a8c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:04:59 +0000 Subject: [PATCH 08/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scvi/dataloaders/_concat_dataloader.py | 9 ++++++--- src/scvi/dataloaders/_samplers.py | 2 +- src/scvi/nn/_base_components.py | 7 +++++-- tests/dataloaders/test_dataloaders.py | 10 +++++++--- tests/dataloaders/test_samplers.py | 8 ++++---- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py index a1c4616f56..9d35e2fb47 100644 --- a/src/scvi/dataloaders/_concat_dataloader.py +++ b/src/scvi/dataloaders/_concat_dataloader.py @@ -53,8 +53,11 @@ def __init__( self._shuffle = shuffle self._batch_size = batch_size self._drop_last = drop_last - self._drop_dataset_tail = self.dataloader_kwargs["drop_dataset_tail"] \ - if "drop_dataset_tail" in self.dataloader_kwargs.keys() else False + self._drop_dataset_tail = ( + self.dataloader_kwargs["drop_dataset_tail"] + if "drop_dataset_tail" in self.dataloader_kwargs.keys() + else False + ) self.dataloaders = [] for indices in indices_list: @@ -72,7 +75,7 @@ def __init__( ) lens = [len(dl) for dl in self.dataloaders] self.largest_dl = self.dataloaders[np.argmax(lens)] - for redundant_key in ["save_path", "num_processes","drop_dataset_tail"]: + for redundant_key in ["save_path", "num_processes", "drop_dataset_tail"]: if redundant_key 
in data_loader_kwargs: data_loader_kwargs.pop(redundant_key) super().__init__(self.largest_dl, drop_last=self._drop_dataset_tail, **data_loader_kwargs) diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index 3e62bc9b3c..0d7b246940 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -44,7 +44,7 @@ def __init__( init_method="file://" + kwargs["save_path"] + "/dist_file", rank=0, world_size=kwargs["num_processes"], - store=None + store=None, ) for redundant_key in [ diff --git a/src/scvi/nn/_base_components.py b/src/scvi/nn/_base_components.py index f632aff18d..75cf438da5 100644 --- a/src/scvi/nn/_base_components.py +++ b/src/scvi/nn/_base_components.py @@ -175,8 +175,11 @@ def forward(self, x: torch.Tensor, *cat_list: int): if isinstance(layer, nn.Linear) and self.inject_into_layer(i): if x.dim() == 3: one_hot_cat_list_layer = [ - o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1))) if o.dim()==2 - else o[0].unsqueeze(0).expand((x.size(1), o.size(1), o.size(2))) + o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1))) + if o.dim() == 2 + else o[0] + .unsqueeze(0) + .expand((x.size(1), o.size(1), o.size(2))) for o in one_hot_cat_list ] else: diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index b3b22abda2..95925e0992 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -95,7 +95,11 @@ def test_anndataloader_distributed_sampler_init(): def multiprocessing_worker( - rank: int, world_size: int, manager: scvi.data.AnnDataManager, save_path: str, datasplitter_kwargs + rank: int, + world_size: int, + manager: scvi.data.AnnDataManager, + save_path: str, + datasplitter_kwargs, ): # initializes the distributed backend that takes care of synchronizing processes torch.distributed.init_process_group( @@ -103,7 +107,7 @@ def multiprocessing_worker( init_method=f"file://{save_path}/dist_file", rank=rank, 
world_size=world_size, - store = None + store=None, ) _ = scvi.dataloaders.AnnDataLoader(manager, **datasplitter_kwargs) @@ -145,7 +149,7 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if os.path.exists(file_path): # Check if the file exists os.remove(file_path) datasplitter_kwargs = {} - #Multi-GPU settings + # Multi-GPU settings datasplitter_kwargs["distributed_sampler"] = True datasplitter_kwargs["save_path"] = save_path datasplitter_kwargs["num_processes"] = num_processes diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py index 58d04ed67f..09a8a5a127 100644 --- a/tests/dataloaders/test_samplers.py +++ b/tests/dataloaders/test_samplers.py @@ -1,7 +1,7 @@ +import os from math import ceil, floor import numpy as np -import os import pytest from tests.data.utils import generic_setup_adata_manager @@ -33,7 +33,7 @@ def test_batchdistributedsampler_init( drop_last=True, drop_dataset_tail=True, num_processes=num_processes, - save_path=save_path + save_path=save_path, ) assert sampler.batch_size == batch_size assert sampler.rank == 0 @@ -128,7 +128,7 @@ def check_samplers(samplers: list, sampler_batch_size: int): drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, num_processes=num_processes, - save_path=save_path + save_path=save_path, ) for i in range(num_replicas) ] @@ -158,7 +158,7 @@ def test_batchdistributedsampler_indices( rank=i, batch_size=batch_size, num_processes=num_processes, - save_path=save_path + save_path=save_path, ) for i in range(num_replicas) ] From 2e3519a8ddd3d50f7d04244451a1eed69aba9414 Mon Sep 17 00:00:00 2001 From: ori-kron-wis Date: Thu, 28 Nov 2024 18:27:06 +0200 Subject: [PATCH 09/42] revert base_component --- src/scvi/nn/_base_components.py | 4 ---- tests/dataloaders/test_dataloaders.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/scvi/nn/_base_components.py b/src/scvi/nn/_base_components.py index 75cf438da5..59b537a321 100644 --- 
a/src/scvi/nn/_base_components.py +++ b/src/scvi/nn/_base_components.py @@ -176,10 +176,6 @@ def forward(self, x: torch.Tensor, *cat_list: int): if x.dim() == 3: one_hot_cat_list_layer = [ o.unsqueeze(0).expand((x.size(0), o.size(0), o.size(1))) - if o.dim() == 2 - else o[0] - .unsqueeze(0) - .expand((x.size(1), o.size(1), o.size(2))) for o in one_hot_cat_list ] else: diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 95925e0992..880fdc3567 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -134,7 +134,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): # @pytest.mark.optional -@pytest.mark.parametrize("num_processes", [2]) +@pytest.mark.parametrize("num_processes", [1,2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): adata = scvi.data.synthetic_iid() From 22c074837f4ba5f0b295c0e4095dd1e09523c57c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:27:21 +0000 Subject: [PATCH 10/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/dataloaders/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 880fdc3567..02813ef0de 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -134,7 +134,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): # @pytest.mark.optional -@pytest.mark.parametrize("num_processes", [1,2]) +@pytest.mark.parametrize("num_processes", [1, 2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): adata = scvi.data.synthetic_iid() From dabd350e1e973bef77ec5d66f0d2125ffd552462 Mon Sep 17 
00:00:00 2001 From: ori-kron-wis Date: Thu, 28 Nov 2024 18:35:55 +0200 Subject: [PATCH 11/42] make cpu tests ok --- src/scvi/dataloaders/_samplers.py | 2 +- tests/dataloaders/test_dataloaders.py | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index 0d7b246940..0392384684 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -37,7 +37,7 @@ def __init__( drop_dataset_tail: bool = False, **kwargs, ): - if not torch.distributed.is_initialized(): + if not torch.distributed.is_initialized() and torch.cuda.is_available(): # initializes the distributed backend that takes care of synchronizing processes torch.distributed.init_process_group( "nccl", # backend that works on all systems diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 02813ef0de..2000ec1f25 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -115,25 +115,24 @@ def multiprocessing_worker( return -# @pytest.mark.optional @pytest.mark.parametrize("num_processes", [1, 2]) def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): - adata = scvi.data.synthetic_iid() - manager = generic_setup_adata_manager(adata) + if torch.cuda.is_available(): + adata = scvi.data.synthetic_iid() + manager = generic_setup_adata_manager(adata) - file_path = save_path + "/dist_file" - if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) - torch.multiprocessing.spawn( - multiprocessing_worker, - args=(num_processes, manager, save_path, {}), - nprocs=num_processes, - join=True, - ) + torch.multiprocessing.spawn( + multiprocessing_worker, + args=(num_processes, manager, save_path, {}), + nprocs=num_processes, + join=True, + ) -# 
@pytest.mark.optional @pytest.mark.parametrize("num_processes", [1, 2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): From ce7f3318e210675707d86d4761a3f4305fb3a98f Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 2 Dec 2024 15:52:29 +0200 Subject: [PATCH 12/42] Added running multiGPU tests for scvi,scanvi + revert some code fixes --- src/scvi/dataloaders/_ann_dataloader.py | 7 --- src/scvi/dataloaders/_concat_dataloader.py | 3 - src/scvi/dataloaders/_samplers.py | 11 ---- tests/dataloaders/test_dataloaders.py | 42 ++------------ tests/dataloaders/test_samplers.py | 28 --------- tests/model/test_scanvi.py | 66 ++++++++++++++++++++++ tests/model/test_scvi.py | 59 +++++++++++++++++++ 7 files changed, 130 insertions(+), 86 deletions(-) diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py index ee3336ce09..82a13ac0b6 100644 --- a/src/scvi/dataloaders/_ann_dataloader.py +++ b/src/scvi/dataloaders/_ann_dataloader.py @@ -122,10 +122,6 @@ def __init__( # for fast access to sparse matrices self.kwargs.update({"batch_size": None, "shuffle": False}) else: - if "save_path" not in kwargs: - kwargs["save_path"] = "/." 
- if "num_processes" not in kwargs: - kwargs["num_processes"] = 1 sampler = BatchDistributedSampler( self.dataset, batch_size=batch_size, @@ -140,7 +136,4 @@ def __init__( if iter_ndarray: self.kwargs.update({"collate_fn": lambda x: x}) - for redundant_key in ["save_path", "num_processes"]: - if redundant_key in self.kwargs: - self.kwargs.pop(redundant_key) super().__init__(self.dataset, drop_last=drop_dataset_tail, **self.kwargs) diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py index 9d35e2fb47..554f0267f0 100644 --- a/src/scvi/dataloaders/_concat_dataloader.py +++ b/src/scvi/dataloaders/_concat_dataloader.py @@ -75,9 +75,6 @@ def __init__( ) lens = [len(dl) for dl in self.dataloaders] self.largest_dl = self.dataloaders[np.argmax(lens)] - for redundant_key in ["save_path", "num_processes", "drop_dataset_tail"]: - if redundant_key in data_loader_kwargs: - data_loader_kwargs.pop(redundant_key) super().__init__(self.largest_dl, drop_last=self._drop_dataset_tail, **data_loader_kwargs) def __len__(self): diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index 0392384684..1e866721b7 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -37,20 +37,9 @@ def __init__( drop_dataset_tail: bool = False, **kwargs, ): - if not torch.distributed.is_initialized() and torch.cuda.is_available(): - # initializes the distributed backend that takes care of synchronizing processes - torch.distributed.init_process_group( - "nccl", # backend that works on all systems - init_method="file://" + kwargs["save_path"] + "/dist_file", - rank=0, - world_size=kwargs["num_processes"], - store=None, - ) for redundant_key in [ - "save_path", "pin_memory", - "num_processes", "num_workers", "persistent_workers", ]: diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 2000ec1f25..9ef80f71de 100644 --- a/tests/dataloaders/test_dataloaders.py 
+++ b/tests/dataloaders/test_dataloaders.py @@ -7,8 +7,12 @@ import scvi from scvi import REGISTRY_KEYS -from scvi.model import SCANVI +from scvi.model import SCANVI, SCVI +import sys + +if __name__ == "__main__" and "pytest" in sys.modules: + sys.argv = sys.argv[:1] # Remove pytest arguments class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan): def __init__(self, *args, **kwargs): @@ -131,39 +135,3 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): nprocs=num_processes, join=True, ) - - -@pytest.mark.parametrize("num_processes", [1, 2]) -def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): - if torch.cuda.is_available(): - adata = scvi.data.synthetic_iid() - manager = generic_setup_adata_manager(adata) - SCANVI.setup_anndata( - adata, - "labels", - "label_0", - batch_key="batch", - ) - file_path = save_path + "/dist_file" - if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) - datasplitter_kwargs = {} - # Multi-GPU settings - datasplitter_kwargs["distributed_sampler"] = True - datasplitter_kwargs["save_path"] = save_path - datasplitter_kwargs["num_processes"] = num_processes - datasplitter_kwargs["drop_dataset_tail"] = True - datasplitter_kwargs["drop_last"] = False - if num_processes == 1: - datasplitter_kwargs["distributed_sampler"] = False - datasplitter_kwargs["drop_dataset_tail"] = False - model = SCANVI(adata, n_latent=10) - - torch.multiprocessing.spawn( - multiprocessing_worker, - args=(num_processes, manager, save_path, {}), - nprocs=num_processes, - join=True, - ) - - model.train(1, datasplitter_kwargs=datasplitter_kwargs) diff --git a/tests/dataloaders/test_samplers.py b/tests/dataloaders/test_samplers.py index 09a8a5a127..ae4861d98f 100644 --- a/tests/dataloaders/test_samplers.py +++ b/tests/dataloaders/test_samplers.py @@ -1,4 +1,3 @@ -import os from math import ceil, floor import numpy as np @@ -9,10 +8,7 @@ from scvi.dataloaders import 
BatchDistributedSampler -@pytest.mark.parametrize("num_processes", [1, 2]) def test_batchdistributedsampler_init( - num_processes: int, - save_path: str, batch_size: int = 128, n_batches: int = 2, ): @@ -20,10 +16,6 @@ def test_batchdistributedsampler_init( manager = generic_setup_adata_manager(adata) dataset = manager.create_torch_dataset() - file_path = save_path + "/dist_file" - if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) - sampler = BatchDistributedSampler( dataset, num_replicas=1, @@ -32,8 +24,6 @@ def test_batchdistributedsampler_init( shuffle=True, drop_last=True, drop_dataset_tail=True, - num_processes=num_processes, - save_path=save_path, ) assert sampler.batch_size == batch_size assert sampler.rank == 0 @@ -45,12 +35,9 @@ def test_batchdistributedsampler_init( @pytest.mark.parametrize("drop_last", [True, False]) @pytest.mark.parametrize("drop_dataset_tail", [True, False]) -@pytest.mark.parametrize("num_processes", [1, 2]) def test_batchdistributedsampler_drop_last( - num_processes: int, drop_last: bool, drop_dataset_tail: bool, - save_path: str, batch_size: int = 128, n_batches: int = 3, num_replicas: int = 2, @@ -114,10 +101,6 @@ def check_samplers(samplers: list, sampler_batch_size: int): assert len(all_indices) == effective_n_obs_per_sampler assert [len(indices) for indices in batch_indices] == batch_sizes - file_path = save_path + "/dist_file" - if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) - for sampler_batch_size in [batch_size, batch_size - 1, batch_size + 1]: samplers = [ BatchDistributedSampler( @@ -127,18 +110,13 @@ def check_samplers(samplers: list, sampler_batch_size: int): batch_size=sampler_batch_size, drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, - num_processes=num_processes, - save_path=save_path, ) for i in range(num_replicas) ] check_samplers(samplers, sampler_batch_size) -@pytest.mark.parametrize("num_processes", [1, 2]) def 
test_batchdistributedsampler_indices( - num_processes: int, - save_path: str, batch_size: int = 128, n_batches: int = 3, num_replicas: int = 2, @@ -147,18 +125,12 @@ def test_batchdistributedsampler_indices( manager = generic_setup_adata_manager(adata) dataset = manager.create_torch_dataset() - file_path = save_path + "/dist_file" - if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) - samplers = [ BatchDistributedSampler( dataset, num_replicas=num_replicas, rank=i, batch_size=batch_size, - num_processes=num_processes, - save_path=save_path, ) for i in range(num_replicas) ] diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py index aede994029..746a64ee31 100644 --- a/tests/model/test_scanvi.py +++ b/tests/model/test_scanvi.py @@ -5,6 +5,7 @@ import pandas as pd import pytest import torch +import subprocess from scvi.data import synthetic_iid from scvi.model import SCANVI, SCVI @@ -578,3 +579,68 @@ def check_no_logits_and_softmax(model: SCANVI): model = SCANVI.load(resave_model_path, adata) check_no_logits_and_softmax(model) + + +def test_scanvi_train_ddp(): + training_code = """ +import torch +import scvi +from scvi.model import SCANVI + +adata = scvi.data.synthetic_iid() +SCANVI.setup_anndata( + adata, + "labels", + "label_0", + batch_key="batch", +) + +model = SCANVI(adata, n_latent=10) + +model.train( + max_epochs=100, + train_size=0.5, + check_val_every_n_epoch=1, + accelerator="gpu", + devices=-1, + strategy="ddp_find_unused_parameters_true", +) + +torch.distributed.destroy_process_group() + +assert model.is_trained +""" + + if torch.cuda.is_available(): + + # Get the current working directory (CWD) + cwd = os.getcwd() + + # Define the file path for the temporary script in the current working directory + temp_file_path = os.path.join(cwd, "train_scanvi_ddp_temp.py") + + # Write the training code to the file in the current working directory + with open(temp_file_path, "w") as temp_file: + 
temp_file.write(training_code) + print(f"Temporary Python file created at: {temp_file_path}") + + def launch_ddp(world_size, temp_file_path): + # Command to run the script via torchrun + command = [ + "torchrun", + "--nproc_per_node="+str(world_size), # Specify the number of GPUs + temp_file_path # Your original script + ] + # Use subprocess to run the command + try: + # Run the command, wait for it to finish & clean up the temporary file + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + os.remove(temp_file_path) + raise ValueError( + f"Error occurred while running the DDP training: {e}" + ) + finally: + os.remove(temp_file_path) + + launch_ddp(torch.cuda.device_count(), temp_file_path) diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index 49fe18e531..44c62f2247 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -11,6 +11,7 @@ from lightning.pytorch.callbacks import LearningRateMonitor from scipy.sparse import csr_matrix from torch.nn import Softplus +import subprocess import scvi from scvi.data import _constants, synthetic_iid @@ -1297,3 +1298,61 @@ def test_scvi_num_workers(): model.get_reconstruction_error() model.get_normalized_expression(transform_batch="batch_1") model.get_normalized_expression(n_samples=2) + + +def test_scvi_train_ddp(): + training_code = """ +import torch +import scvi +from scvi.model import SCVI + +adata = scvi.data.synthetic_iid() +SCVI.setup_anndata(adata) + +model = SCVI(adata) + +model.train( + max_epochs=100, + check_val_every_n_epoch=1, + accelerator="gpu", + devices=-1, + strategy="ddp_find_unused_parameters_true", +) + +torch.distributed.destroy_process_group() + +assert model.is_trained +""" + + if torch.cuda.is_available(): + # Get the current working directory (CWD) + cwd = os.getcwd() + + # Define the file path for the temporary script in the current working directory + temp_file_path = os.path.join(cwd, "train_scvi_ddp_temp.py") + + # Write the training 
code to the file in the current working directory + with open(temp_file_path, "w") as temp_file: + temp_file.write(training_code) + print(f"Temporary Python file created at: {temp_file_path}") + + def launch_ddp(world_size, temp_file_path): + # Command to run the script via torchrun + command = [ + "torchrun", + "--nproc_per_node="+str(world_size), # Specify the number of GPUs + temp_file_path # Your original script + ] + # Use subprocess to run the command + try: + # Run the command, wait for it to finish & clean up the temporary file + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + os.remove(temp_file_path) + raise ValueError( + f"Error occurred while running the DDP training: {e}" + ) + finally: + os.remove(temp_file_path) + + launch_ddp(torch.cuda.device_count(), temp_file_path) From 67ec59b581c5c8a9583ca9db58b6aa5bc0136115 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 13:52:56 +0000 Subject: [PATCH 13/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scvi/dataloaders/_samplers.py | 2 -- tests/dataloaders/test_dataloaders.py | 5 ++--- tests/model/test_scanvi.py | 11 ++++------- tests/model/test_scvi.py | 10 ++++------ 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index 1e866721b7..8753c6feec 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -1,4 +1,3 @@ -import torch from torch.utils.data import Dataset, DistributedSampler @@ -37,7 +36,6 @@ def __init__( drop_dataset_tail: bool = False, **kwargs, ): - for redundant_key in [ "pin_memory", "num_workers", diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 9ef80f71de..33795890a8 100644 --- a/tests/dataloaders/test_dataloaders.py +++ 
b/tests/dataloaders/test_dataloaders.py @@ -1,4 +1,5 @@ import os +import sys import numpy as np import pytest @@ -7,13 +8,11 @@ import scvi from scvi import REGISTRY_KEYS -from scvi.model import SCANVI, SCVI - -import sys if __name__ == "__main__" and "pytest" in sys.modules: sys.argv = sys.argv[:1] # Remove pytest arguments + class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan): def __init__(self, *args, **kwargs): self.n_samples_per_label = kwargs.pop("n_samples_per_label") diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py index 746a64ee31..5685806d60 100644 --- a/tests/model/test_scanvi.py +++ b/tests/model/test_scanvi.py @@ -1,11 +1,11 @@ import os import pickle +import subprocess import numpy as np import pandas as pd import pytest import torch -import subprocess from scvi.data import synthetic_iid from scvi.model import SCANVI, SCVI @@ -612,7 +612,6 @@ def test_scanvi_train_ddp(): """ if torch.cuda.is_available(): - # Get the current working directory (CWD) cwd = os.getcwd() @@ -628,8 +627,8 @@ def launch_ddp(world_size, temp_file_path): # Command to run the script via torchrun command = [ "torchrun", - "--nproc_per_node="+str(world_size), # Specify the number of GPUs - temp_file_path # Your original script + "--nproc_per_node=" + str(world_size), # Specify the number of GPUs + temp_file_path, # Your original script ] # Use subprocess to run the command try: @@ -637,9 +636,7 @@ def launch_ddp(world_size, temp_file_path): subprocess.run(command, check=True) except subprocess.CalledProcessError as e: os.remove(temp_file_path) - raise ValueError( - f"Error occurred while running the DDP training: {e}" - ) + raise ValueError(f"Error occurred while running the DDP training: {e}") finally: os.remove(temp_file_path) diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index 44c62f2247..c181aa4220 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1,6 +1,7 @@ import inspect import os import 
pickle +import subprocess import tarfile from unittest import mock @@ -11,7 +12,6 @@ from lightning.pytorch.callbacks import LearningRateMonitor from scipy.sparse import csr_matrix from torch.nn import Softplus -import subprocess import scvi from scvi.data import _constants, synthetic_iid @@ -1340,8 +1340,8 @@ def launch_ddp(world_size, temp_file_path): # Command to run the script via torchrun command = [ "torchrun", - "--nproc_per_node="+str(world_size), # Specify the number of GPUs - temp_file_path # Your original script + "--nproc_per_node=" + str(world_size), # Specify the number of GPUs + temp_file_path, # Your original script ] # Use subprocess to run the command try: @@ -1349,9 +1349,7 @@ def launch_ddp(world_size, temp_file_path): subprocess.run(command, check=True) except subprocess.CalledProcessError as e: os.remove(temp_file_path) - raise ValueError( - f"Error occurred while running the DDP training: {e}" - ) + raise ValueError(f"Error occurred while running the DDP training: {e}") finally: os.remove(temp_file_path) From 054674e13bca0d8b7d616ea862e6fdd1cb74cd5e Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 2 Dec 2024 18:08:53 +0200 Subject: [PATCH 14/42] tests fixes --- tests/model/test_models_with_minified_data.py | 6 +++--- tests/model/test_scanvi.py | 3 ++- tests/model/test_scvi.py | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/model/test_models_with_minified_data.py b/tests/model/test_models_with_minified_data.py index 52c9013362..e88efa14e6 100644 --- a/tests/model/test_models_with_minified_data.py +++ b/tests/model/test_models_with_minified_data.py @@ -281,10 +281,10 @@ def test_validate_supported_if_minified_keep_count(): assert model.minified_data_type == ADATA_MINIFY_TYPE.LATENT_POSTERIOR_WITH_COUNTS assert model2.minified_data_type is None - assert np.allclose(model2.get_elbo(), model.get_elbo(), rtol=5e-2) + assert np.allclose(model2.get_elbo().cpu(), model.get_elbo().cpu(), rtol=5e-2) assert np.allclose( 
- model2.get_reconstruction_error()["reconstruction_loss"], - model.get_reconstruction_error()["reconstruction_loss"], + model2.get_reconstruction_error()["reconstruction_loss"].cpu(), + model.get_reconstruction_error()["reconstruction_loss"].cpu(), rtol=5e-2, ) assert np.allclose(model2.get_marginal_ll(), model.get_marginal_ll(), rtol=5e-2) diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py index 5685806d60..b7fd0864e8 100644 --- a/tests/model/test_scanvi.py +++ b/tests/model/test_scanvi.py @@ -636,7 +636,8 @@ def launch_ddp(world_size, temp_file_path): subprocess.run(command, check=True) except subprocess.CalledProcessError as e: os.remove(temp_file_path) - raise ValueError(f"Error occurred while running the DDP training: {e}") + print(f"Error occurred while running the DDP training: {e}") + raise finally: os.remove(temp_file_path) diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index c181aa4220..f499fe7f75 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1349,7 +1349,8 @@ def launch_ddp(world_size, temp_file_path): subprocess.run(command, check=True) except subprocess.CalledProcessError as e: os.remove(temp_file_path) - raise ValueError(f"Error occurred while running the DDP training: {e}") + print(f"Error occurred while running the DDP training: {e}") + raise finally: os.remove(temp_file_path) From 00f5c37d1da6f4f728abd6ebff84792a2afc3167 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 2 Dec 2024 21:34:53 +0200 Subject: [PATCH 15/42] revert scanvi distributed test --- src/scvi/model/_scanvi.py | 3 +- tests/dataloaders/test_datasplitter.py | 18 +++- tests/model/test_scanvi.py | 122 ++++++++++++------------- 3 files changed, 79 insertions(+), 64 deletions(-) diff --git a/src/scvi/model/_scanvi.py b/src/scvi/model/_scanvi.py index 084d83be1f..836a22d492 100644 --- a/src/scvi/model/_scanvi.py +++ b/src/scvi/model/_scanvi.py @@ -25,7 +25,7 @@ NumericalObsField, ) from scvi.dataloaders import 
SemiSupervisedDataSplitter -from scvi.model._utils import _init_library_size, get_max_epochs_heuristic +from scvi.model._utils import _init_library_size, get_max_epochs_heuristic, use_distributed_sampler from scvi.module import SCANVAE from scvi.train import SemiSupervisedTrainingPlan, TrainRunner from scvi.train._callbacks import SubSampleLabels @@ -411,6 +411,7 @@ def train( shuffle_set_split=shuffle_set_split, n_samples_per_label=n_samples_per_label, batch_size=batch_size, + #distributed_sampler=use_distributed_sampler(trainer_kwargs.get("strategy", None)), **datasplitter_kwargs, ) training_plan = self._training_plan_cls(self.module, self.n_labels, **plan_kwargs) diff --git a/tests/dataloaders/test_datasplitter.py b/tests/dataloaders/test_datasplitter.py index c38e4ba57e..b8b9e8d72e 100644 --- a/tests/dataloaders/test_datasplitter.py +++ b/tests/dataloaders/test_datasplitter.py @@ -5,7 +5,7 @@ import numpy as np import pytest from sparse_utils import TestSparseModel -from tests.data.utils import generic_setup_adata_manager +from tests.data.utils import generic_setup_adata_manager, scanvi_setup_adata_manager import scvi @@ -17,7 +17,7 @@ def test_datasplitter_shuffle(self): with pytest.raises(ValueError) as excinfo: scvi.dataloaders.DataSplitter( - manager, train_size=1.5, validation_size=0.3, shuffle_set_split=False + manager, train_size=1.5, validation_size=0.3, shuffle_set_split=False, ) assert str(excinfo.value) == "Invalid train_size. 
Must be: 0 < train_size <= 1" @@ -188,6 +188,20 @@ def test_datasplitter_external_with_duplicates(self): scvi.dataloaders.DataSplitter(manager, external_indexing=[train_ind]) assert str(excinfo.value) == "There are duplicate indexing in train set" + def test_datasplitter_distributed_sampler(self): + adata = scvi.data.synthetic_iid() + manager = generic_setup_adata_manager(adata) + datasplitter_kwargs = {} + datasplitter_kwargs['distributed_sampler'] = True + + scvi.dataloaders.DataSplitter(manager, **datasplitter_kwargs,) + + def test_semisupervised_datasplitter_distributed_sampler(self): + adata = scvi.data.synthetic_iid() + manager = scanvi_setup_adata_manager(adata, labels_key="labels",unlabeled_category="label_0") + datasplitter_kwargs = {} + datasplitter_kwargs['distributed_sampler'] = True + scvi.dataloaders.SemiSupervisedDataSplitter(adata_manager=manager, **datasplitter_kwargs,) @pytest.mark.parametrize("sparse_format", ["csr_matrix", "csc_matrix"]) def test_datasplitter_load_sparse_tensor( diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py index b7fd0864e8..199d895abc 100644 --- a/tests/model/test_scanvi.py +++ b/tests/model/test_scanvi.py @@ -581,64 +581,64 @@ def check_no_logits_and_softmax(model: SCANVI): check_no_logits_and_softmax(model) -def test_scanvi_train_ddp(): - training_code = """ -import torch -import scvi -from scvi.model import SCANVI - -adata = scvi.data.synthetic_iid() -SCANVI.setup_anndata( - adata, - "labels", - "label_0", - batch_key="batch", -) - -model = SCANVI(adata, n_latent=10) - -model.train( - max_epochs=100, - train_size=0.5, - check_val_every_n_epoch=1, - accelerator="gpu", - devices=-1, - strategy="ddp_find_unused_parameters_true", -) - -torch.distributed.destroy_process_group() - -assert model.is_trained -""" - - if torch.cuda.is_available(): - # Get the current working directory (CWD) - cwd = os.getcwd() - - # Define the file path for the temporary script in the current working directory - temp_file_path = 
os.path.join(cwd, "train_scanvi_ddp_temp.py") - - # Write the training code to the file in the current working directory - with open(temp_file_path, "w") as temp_file: - temp_file.write(training_code) - print(f"Temporary Python file created at: {temp_file_path}") - - def launch_ddp(world_size, temp_file_path): - # Command to run the script via torchrun - command = [ - "torchrun", - "--nproc_per_node=" + str(world_size), # Specify the number of GPUs - temp_file_path, # Your original script - ] - # Use subprocess to run the command - try: - # Run the command, wait for it to finish & clean up the temporary file - subprocess.run(command, check=True) - except subprocess.CalledProcessError as e: - os.remove(temp_file_path) - print(f"Error occurred while running the DDP training: {e}") - raise - finally: - os.remove(temp_file_path) - - launch_ddp(torch.cuda.device_count(), temp_file_path) +# def test_scanvi_train_ddp(): +# training_code = """ +# import torch +# import scvi +# from scvi.model import SCANVI +# +# adata = scvi.data.synthetic_iid() +# SCANVI.setup_anndata( +# adata, +# "labels", +# "label_0", +# batch_key="batch", +# ) +# +# model = SCANVI(adata, n_latent=10) +# +# model.train( +# max_epochs=100, +# train_size=0.5, +# check_val_every_n_epoch=1, +# accelerator="gpu", +# devices=-1, +# strategy="ddp_find_unused_parameters_true", +# ) +# +# torch.distributed.destroy_process_group() +# +# assert model.is_trained +# """ +# +# if torch.cuda.is_available(): +# # Get the current working directory (CWD) +# cwd = os.getcwd() +# +# # Define the file path for the temporary script in the current working directory +# temp_file_path = os.path.join(cwd, "train_scanvi_ddp_temp.py") +# +# # Write the training code to the file in the current working directory +# with open(temp_file_path, "w") as temp_file: +# temp_file.write(training_code) +# print(f"Temporary Python file created at: {temp_file_path}") +# +# def launch_ddp(world_size, temp_file_path): +# # Command to run the 
script via torchrun +# command = [ +# "torchrun", +# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs +# temp_file_path, # Your original script +# ] +# # Use subprocess to run the command +# try: +# # Run the command, wait for it to finish & clean up the temporary file +# subprocess.run(command, check=True) +# except subprocess.CalledProcessError as e: +# os.remove(temp_file_path) +# print(f"Error occurred while running the DDP training: {e}") +# raise +# finally: +# os.remove(temp_file_path) +# +# launch_ddp(torch.cuda.device_count(), temp_file_path) From 19b382f40e44b42ee092d068bd6c358ec8a8cb24 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 19:35:14 +0000 Subject: [PATCH 16/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scvi/model/_scanvi.py | 4 ++-- tests/dataloaders/test_datasplitter.py | 24 ++++++++++++++++++------ tests/model/test_scanvi.py | 1 - 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/scvi/model/_scanvi.py b/src/scvi/model/_scanvi.py index 836a22d492..edd75e5324 100644 --- a/src/scvi/model/_scanvi.py +++ b/src/scvi/model/_scanvi.py @@ -25,7 +25,7 @@ NumericalObsField, ) from scvi.dataloaders import SemiSupervisedDataSplitter -from scvi.model._utils import _init_library_size, get_max_epochs_heuristic, use_distributed_sampler +from scvi.model._utils import _init_library_size, get_max_epochs_heuristic from scvi.module import SCANVAE from scvi.train import SemiSupervisedTrainingPlan, TrainRunner from scvi.train._callbacks import SubSampleLabels @@ -411,7 +411,7 @@ def train( shuffle_set_split=shuffle_set_split, n_samples_per_label=n_samples_per_label, batch_size=batch_size, - #distributed_sampler=use_distributed_sampler(trainer_kwargs.get("strategy", None)), + # distributed_sampler=use_distributed_sampler(trainer_kwargs.get("strategy", None)), **datasplitter_kwargs, ) 
training_plan = self._training_plan_cls(self.module, self.n_labels, **plan_kwargs) diff --git a/tests/dataloaders/test_datasplitter.py b/tests/dataloaders/test_datasplitter.py index b8b9e8d72e..aad5d54b36 100644 --- a/tests/dataloaders/test_datasplitter.py +++ b/tests/dataloaders/test_datasplitter.py @@ -17,7 +17,10 @@ def test_datasplitter_shuffle(self): with pytest.raises(ValueError) as excinfo: scvi.dataloaders.DataSplitter( - manager, train_size=1.5, validation_size=0.3, shuffle_set_split=False, + manager, + train_size=1.5, + validation_size=0.3, + shuffle_set_split=False, ) assert str(excinfo.value) == "Invalid train_size. Must be: 0 < train_size <= 1" @@ -192,16 +195,25 @@ def test_datasplitter_distributed_sampler(self): adata = scvi.data.synthetic_iid() manager = generic_setup_adata_manager(adata) datasplitter_kwargs = {} - datasplitter_kwargs['distributed_sampler'] = True + datasplitter_kwargs["distributed_sampler"] = True - scvi.dataloaders.DataSplitter(manager, **datasplitter_kwargs,) + scvi.dataloaders.DataSplitter( + manager, + **datasplitter_kwargs, + ) def test_semisupervised_datasplitter_distributed_sampler(self): adata = scvi.data.synthetic_iid() - manager = scanvi_setup_adata_manager(adata, labels_key="labels",unlabeled_category="label_0") + manager = scanvi_setup_adata_manager( + adata, labels_key="labels", unlabeled_category="label_0" + ) datasplitter_kwargs = {} - datasplitter_kwargs['distributed_sampler'] = True - scvi.dataloaders.SemiSupervisedDataSplitter(adata_manager=manager, **datasplitter_kwargs,) + datasplitter_kwargs["distributed_sampler"] = True + scvi.dataloaders.SemiSupervisedDataSplitter( + adata_manager=manager, + **datasplitter_kwargs, + ) + @pytest.mark.parametrize("sparse_format", ["csr_matrix", "csc_matrix"]) def test_datasplitter_load_sparse_tensor( diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py index 199d895abc..7d8fced0d2 100644 --- a/tests/model/test_scanvi.py +++ b/tests/model/test_scanvi.py @@ -1,6 
+1,5 @@ import os import pickle -import subprocess import numpy as np import pandas as pd From 523941368e8641e19f0831d01a77493c8438d89d Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 2 Dec 2024 21:36:37 +0200 Subject: [PATCH 17/42] revert scanvi distributed test --- tests/dataloaders/test_dataloaders.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 33795890a8..12802a02da 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -9,10 +9,6 @@ import scvi from scvi import REGISTRY_KEYS -if __name__ == "__main__" and "pytest" in sys.modules: - sys.argv = sys.argv[:1] # Remove pytest arguments - - class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan): def __init__(self, *args, **kwargs): self.n_samples_per_label = kwargs.pop("n_samples_per_label") From d61aebe1c000de8f9002ae6b3d5742f378a0e899 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 19:38:25 +0000 Subject: [PATCH 18/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/dataloaders/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 12802a02da..60e5e7ea34 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -1,5 +1,4 @@ import os -import sys import numpy as np import pytest @@ -9,6 +8,7 @@ import scvi from scvi import REGISTRY_KEYS + class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan): def __init__(self, *args, **kwargs): self.n_samples_per_label = kwargs.pop("n_samples_per_label") From 0d424656acd0288e4155b023eab22a3a9be43e6c Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 09:54:26 +0200 Subject: [PATCH 19/42] 
Fixed tests --- tests/dataloaders/test_dataloaders.py | 35 ++++++++++++++++++++++++++ tests/dataloaders/test_datasplitter.py | 25 +----------------- tests/model/test_scvi.py | 7 ++---- 3 files changed, 38 insertions(+), 29 deletions(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 60e5e7ea34..737d674dd7 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -4,6 +4,7 @@ import pytest import torch from tests.data.utils import generic_setup_adata_manager +from scvi.model import SCANVI import scvi from scvi import REGISTRY_KEYS @@ -130,3 +131,37 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): nprocs=num_processes, join=True, ) + + +@pytest.mark.parametrize("num_processes", [1]) +def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): + if torch.cuda.is_available(): + adata = scvi.data.synthetic_iid() + manager = generic_setup_adata_manager(adata) + SCANVI.setup_anndata( + adata, + "labels", + "label_0", + batch_key="batch", + ) + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + datasplitter_kwargs = {} + # Multi-GPU settings + datasplitter_kwargs["distributed_sampler"] = True + datasplitter_kwargs["drop_last"] = False + if num_processes == 1: + datasplitter_kwargs["distributed_sampler"] = False + model = SCANVI(adata, n_latent=10) + + # initializes the distributed backend that takes care of synchronizing processes + torch.distributed.init_process_group( + "nccl", # backend that works on all systems + init_method=f"file://{save_path}/dist_file", + rank=0, + world_size=num_processes, + store=None, + ) + + model.train(1, datasplitter_kwargs=datasplitter_kwargs) diff --git a/tests/dataloaders/test_datasplitter.py b/tests/dataloaders/test_datasplitter.py index aad5d54b36..3162194419 100644 --- a/tests/dataloaders/test_datasplitter.py +++ 
b/tests/dataloaders/test_datasplitter.py @@ -5,7 +5,7 @@ import numpy as np import pytest from sparse_utils import TestSparseModel -from tests.data.utils import generic_setup_adata_manager, scanvi_setup_adata_manager +from tests.data.utils import generic_setup_adata_manager import scvi @@ -191,29 +191,6 @@ def test_datasplitter_external_with_duplicates(self): scvi.dataloaders.DataSplitter(manager, external_indexing=[train_ind]) assert str(excinfo.value) == "There are duplicate indexing in train set" - def test_datasplitter_distributed_sampler(self): - adata = scvi.data.synthetic_iid() - manager = generic_setup_adata_manager(adata) - datasplitter_kwargs = {} - datasplitter_kwargs["distributed_sampler"] = True - - scvi.dataloaders.DataSplitter( - manager, - **datasplitter_kwargs, - ) - - def test_semisupervised_datasplitter_distributed_sampler(self): - adata = scvi.data.synthetic_iid() - manager = scanvi_setup_adata_manager( - adata, labels_key="labels", unlabeled_category="label_0" - ) - datasplitter_kwargs = {} - datasplitter_kwargs["distributed_sampler"] = True - scvi.dataloaders.SemiSupervisedDataSplitter( - adata_manager=manager, - **datasplitter_kwargs, - ) - @pytest.mark.parametrize("sparse_format", ["csr_matrix", "csc_matrix"]) def test_datasplitter_load_sparse_tensor( diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index f499fe7f75..334665a31b 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1300,7 +1300,7 @@ def test_scvi_num_workers(): model.get_normalized_expression(n_samples=2) -def test_scvi_train_ddp(): +def test_scvi_train_ddp(save_path: str): training_code = """ import torch import scvi @@ -1325,11 +1325,8 @@ def test_scvi_train_ddp(): """ if torch.cuda.is_available(): - # Get the current working directory (CWD) - cwd = os.getcwd() - # Define the file path for the temporary script in the current working directory - temp_file_path = os.path.join(cwd, "train_scvi_ddp_temp.py") + temp_file_path = 
os.path.join(save_path, "train_scvi_ddp_temp.py") # Write the training code to the file in the current working directory with open(temp_file_path, "w") as temp_file: From f8e44b5318a306384ac6cea3af9d8ac829543549 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 07:54:43 +0000 Subject: [PATCH 20/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/dataloaders/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 737d674dd7..e4f62b1699 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -4,10 +4,10 @@ import pytest import torch from tests.data.utils import generic_setup_adata_manager -from scvi.model import SCANVI import scvi from scvi import REGISTRY_KEYS +from scvi.model import SCANVI class TestSemiSupervisedTrainingPlan(scvi.train.SemiSupervisedTrainingPlan): From 7360989982e6b3c80bd4b893a58e5514d52b3ce3 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 09:59:56 +0200 Subject: [PATCH 21/42] fix pre commit --- tests/dataloaders/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index e4f62b1699..f6418934b9 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -137,7 +137,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): adata = scvi.data.synthetic_iid() - manager = generic_setup_adata_manager(adata) + #manager = scanvi_setup_adata_manager(adata) SCANVI.setup_anndata( adata, "labels", From c16358b0fcdf33c122947bc6253841371bd5362e Mon Sep 17 00:00:00 2001 
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 08:00:21 +0000 Subject: [PATCH 22/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/dataloaders/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index f6418934b9..54b15cdebd 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -137,7 +137,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): adata = scvi.data.synthetic_iid() - #manager = scanvi_setup_adata_manager(adata) + # manager = scanvi_setup_adata_manager(adata) SCANVI.setup_anndata( adata, "labels", From b37efd5f532c4a9d35bacaf72f16cc49f1ab36bb Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 14:01:15 +0200 Subject: [PATCH 23/42] test precommit --- tests/dataloaders/test_dataloaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 54b15cdebd..7632a720be 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -3,7 +3,7 @@ import numpy as np import pytest import torch -from tests.data.utils import generic_setup_adata_manager +from tests.data.utils import generic_setup_adata_manager, scanvi_setup_adata_manager import scvi from scvi import REGISTRY_KEYS @@ -137,7 +137,7 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): adata = scvi.data.synthetic_iid() - # manager = scanvi_setup_adata_manager(adata) + manager = scanvi_setup_adata_manager(adata) 
SCANVI.setup_anndata( adata, "labels", From 524516305ca1326463928b2227f49162ccfb5712 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 14:10:22 +0200 Subject: [PATCH 24/42] test precommit --- tests/dataloaders/test_dataloaders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 7632a720be..0c6bec9804 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -3,7 +3,7 @@ import numpy as np import pytest import torch -from tests.data.utils import generic_setup_adata_manager, scanvi_setup_adata_manager +from tests.data.utils import generic_setup_adata_manager import scvi from scvi import REGISTRY_KEYS @@ -137,7 +137,6 @@ def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): if torch.cuda.is_available(): adata = scvi.data.synthetic_iid() - manager = scanvi_setup_adata_manager(adata) SCANVI.setup_anndata( adata, "labels", From 6f58e43478822e6324d6971bd1d919a2c40696b8 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 14:49:39 +0200 Subject: [PATCH 25/42] fix cuda test file --- .github/workflows/test_linux_cuda.yml | 2 +- tests/model/test_scvi.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_linux_cuda.yml b/.github/workflows/test_linux_cuda.yml index 0eca4d65c8..d0bc164c9f 100644 --- a/.github/workflows/test_linux_cuda.yml +++ b/.github/workflows/test_linux_cuda.yml @@ -59,7 +59,7 @@ jobs: run: | python -m pip install --upgrade pip wheel uv python -m uv pip install --system "scvi-tools[tests] @ ." 
- python -m pip install jax[cuda] + python -m pip install jax[cuda12] python -m pip install nvidia-nccl-cu12 - name: Run pytest diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index 334665a31b..439976715f 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1300,7 +1300,7 @@ def test_scvi_num_workers(): model.get_normalized_expression(n_samples=2) -def test_scvi_train_ddp(save_path: str): +def test_scvi_train_ddp(save_path: str = "."): training_code = """ import torch import scvi @@ -1312,7 +1312,7 @@ def test_scvi_train_ddp(save_path: str): model = SCVI(adata) model.train( - max_epochs=100, + max_epochs=1, check_val_every_n_epoch=1, accelerator="gpu", devices=-1, From 2b9fe3e74c5b26c033583e13766192dc8451742b Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 15:59:04 +0200 Subject: [PATCH 26/42] added jax accelerator for tests --- tests/model/test_jaxscvi.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py index dac25b7c89..055d6b8dd0 100644 --- a/tests/model/test_jaxscvi.py +++ b/tests/model/test_jaxscvi.py @@ -2,6 +2,7 @@ import numpy as np import pytest +import torch from flax import linen as nn from scvi.data import synthetic_iid @@ -10,17 +11,19 @@ def test_jax_scvi(n_latent=5): + accelerator = "gpu" if torch.cuda.is_available() else "cpu" + adata = synthetic_iid() JaxSCVI.setup_anndata( adata, batch_key="batch", ) model = JaxSCVI(adata, n_latent=n_latent) - model.train(2, train_size=0.5, check_val_every_n_epoch=1) + model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator) model.get_latent_representation() model = JaxSCVI(adata, n_latent=n_latent, gene_likelihood="poisson") - model.train(1, train_size=0.5) + model.train(1, train_size=0.5, accelerator=accelerator) z1 = model.get_latent_representation(give_mean=True, n_samples=1) assert z1.ndim == 2 z2 = 
model.get_latent_representation(give_mean=False, n_samples=15) @@ -29,6 +32,7 @@ def test_jax_scvi(n_latent=5): def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): + accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() JaxSCVI.setup_anndata( adata, @@ -42,7 +46,7 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): mock_dropout = mock.Mock() mock_dropout.side_effect = lambda h, **_kwargs: h mock_dropout_cls.return_value = mock_dropout - model.train(1, train_size=0.5, check_val_every_n_epoch=1) + model.train(1, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator) assert not model.module.training mock_dropout_cls.assert_called() @@ -53,13 +57,14 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): def test_jax_scvi_save_load(save_path: str, n_latent: int = 5): + accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() JaxSCVI.setup_anndata( adata, batch_key="batch", ) model = JaxSCVI(adata, n_latent=n_latent) - model.train(2, train_size=0.5, check_val_every_n_epoch=1) + model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator) z1 = model.get_latent_representation(adata) model.save(save_path, overwrite=True, save_anndata=True) model.view_setup_args(save_path) From 6f51b8e5d3b879616daef341dd6e3a13f3e21a50 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 16:09:44 +0200 Subject: [PATCH 27/42] make jax tests private so they will not run here --- tests/model/test_jaxscvi.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py index 055d6b8dd0..1034b03956 100644 --- a/tests/model/test_jaxscvi.py +++ b/tests/model/test_jaxscvi.py @@ -10,6 +10,7 @@ from scvi.utils import attrdict +@pytest.mark.private def test_jax_scvi(n_latent=5): accelerator = "gpu" if torch.cuda.is_available() else "cpu" @@ -31,6 +32,7 @@ def 
test_jax_scvi(n_latent=5): assert z2.shape[0] == 15 +@pytest.mark.private def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() @@ -56,6 +58,7 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): ) +@pytest.mark.private def test_jax_scvi_save_load(save_path: str, n_latent: int = 5): accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() From 7cd2d437d7f4ab7356b6e15b0c8907491c5c4439 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 16:14:21 +0200 Subject: [PATCH 28/42] fix test scvi ddp --- tests/model/test_scvi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index 439976715f..44e15ddfcc 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1300,7 +1300,7 @@ def test_scvi_num_workers(): model.get_normalized_expression(n_samples=2) -def test_scvi_train_ddp(save_path: str = "."): +def test_scvi_train_ddp(save_path: str): training_code = """ import torch import scvi From 72e4682e7f49d3d23711ab60a1de14e301a012e6 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 16:26:45 +0200 Subject: [PATCH 29/42] revert jax tests --- tests/model/test_jaxscvi.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py index 1034b03956..0d4a97a914 100644 --- a/tests/model/test_jaxscvi.py +++ b/tests/model/test_jaxscvi.py @@ -2,7 +2,8 @@ import numpy as np import pytest -import torch + +# import torch from flax import linen as nn from scvi.data import synthetic_iid @@ -10,9 +11,8 @@ from scvi.utils import attrdict -@pytest.mark.private def test_jax_scvi(n_latent=5): - accelerator = "gpu" if torch.cuda.is_available() else "cpu" + # accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() 
JaxSCVI.setup_anndata( @@ -20,11 +20,11 @@ def test_jax_scvi(n_latent=5): batch_key="batch", ) model = JaxSCVI(adata, n_latent=n_latent) - model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator) + model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu") model.get_latent_representation() model = JaxSCVI(adata, n_latent=n_latent, gene_likelihood="poisson") - model.train(1, train_size=0.5, accelerator=accelerator) + model.train(1, train_size=0.5, accelerator="cpu") z1 = model.get_latent_representation(give_mean=True, n_samples=1) assert z1.ndim == 2 z2 = model.get_latent_representation(give_mean=False, n_samples=15) @@ -32,9 +32,8 @@ def test_jax_scvi(n_latent=5): assert z2.shape[0] == 15 -@pytest.mark.private def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): - accelerator = "gpu" if torch.cuda.is_available() else "cpu" + # accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() JaxSCVI.setup_anndata( adata, @@ -48,7 +47,7 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): mock_dropout = mock.Mock() mock_dropout.side_effect = lambda h, **_kwargs: h mock_dropout_cls.return_value = mock_dropout - model.train(1, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator) + model.train(1, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu") assert not model.module.training mock_dropout_cls.assert_called() @@ -58,16 +57,15 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): ) -@pytest.mark.private def test_jax_scvi_save_load(save_path: str, n_latent: int = 5): - accelerator = "gpu" if torch.cuda.is_available() else "cpu" + # accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() JaxSCVI.setup_anndata( adata, batch_key="batch", ) model = JaxSCVI(adata, n_latent=n_latent) - model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator=accelerator) + model.train(2, 
train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu") z1 = model.get_latent_representation(adata) model.save(save_path, overwrite=True, save_anndata=True) model.view_setup_args(save_path) From 861e5892ac24128e35fab33d80158dd1defa95d1 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Tue, 3 Dec 2024 23:41:33 +0200 Subject: [PATCH 30/42] comment out the ddp test? --- tests/model/test_jaxscvi.py | 8 +-- tests/model/test_scvi.py | 105 ++++++++++++++++++------------------ 2 files changed, 56 insertions(+), 57 deletions(-) diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py index 0d4a97a914..1d6ee6b313 100644 --- a/tests/model/test_jaxscvi.py +++ b/tests/model/test_jaxscvi.py @@ -20,11 +20,11 @@ def test_jax_scvi(n_latent=5): batch_key="batch", ) model = JaxSCVI(adata, n_latent=n_latent) - model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu") + model.train(2, train_size=0.5, check_val_every_n_epoch=1) model.get_latent_representation() model = JaxSCVI(adata, n_latent=n_latent, gene_likelihood="poisson") - model.train(1, train_size=0.5, accelerator="cpu") + model.train(1, train_size=0.5) z1 = model.get_latent_representation(give_mean=True, n_samples=1) assert z1.ndim == 2 z2 = model.get_latent_representation(give_mean=False, n_samples=15) @@ -47,7 +47,7 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): mock_dropout = mock.Mock() mock_dropout.side_effect = lambda h, **_kwargs: h mock_dropout_cls.return_value = mock_dropout - model.train(1, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu") + model.train(1, train_size=0.5, check_val_every_n_epoch=1) assert not model.module.training mock_dropout_cls.assert_called() @@ -65,7 +65,7 @@ def test_jax_scvi_save_load(save_path: str, n_latent: int = 5): batch_key="batch", ) model = JaxSCVI(adata, n_latent=n_latent) - model.train(2, train_size=0.5, check_val_every_n_epoch=1, accelerator="cpu") + model.train(2, train_size=0.5, 
check_val_every_n_epoch=1) z1 = model.get_latent_representation(adata) model.save(save_path, overwrite=True, save_anndata=True) model.view_setup_args(save_path) diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index 44e15ddfcc..acbee48228 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1,7 +1,6 @@ import inspect import os import pickle -import subprocess import tarfile from unittest import mock @@ -1300,55 +1299,55 @@ def test_scvi_num_workers(): model.get_normalized_expression(n_samples=2) -def test_scvi_train_ddp(save_path: str): - training_code = """ -import torch -import scvi -from scvi.model import SCVI - -adata = scvi.data.synthetic_iid() -SCVI.setup_anndata(adata) - -model = SCVI(adata) - -model.train( - max_epochs=1, - check_val_every_n_epoch=1, - accelerator="gpu", - devices=-1, - strategy="ddp_find_unused_parameters_true", -) - -torch.distributed.destroy_process_group() - -assert model.is_trained -""" - - if torch.cuda.is_available(): - # Define the file path for the temporary script in the current working directory - temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") - - # Write the training code to the file in the current working directory - with open(temp_file_path, "w") as temp_file: - temp_file.write(training_code) - print(f"Temporary Python file created at: {temp_file_path}") - - def launch_ddp(world_size, temp_file_path): - # Command to run the script via torchrun - command = [ - "torchrun", - "--nproc_per_node=" + str(world_size), # Specify the number of GPUs - temp_file_path, # Your original script - ] - # Use subprocess to run the command - try: - # Run the command, wait for it to finish & clean up the temporary file - subprocess.run(command, check=True) - except subprocess.CalledProcessError as e: - os.remove(temp_file_path) - print(f"Error occurred while running the DDP training: {e}") - raise - finally: - os.remove(temp_file_path) - - launch_ddp(torch.cuda.device_count(), 
temp_file_path) +# def test_scvi_train_ddp(save_path: str): +# training_code = """ +# import torch +# import scvi +# from scvi.model import SCVI +# +# adata = scvi.data.synthetic_iid() +# SCVI.setup_anndata(adata) +# +# model = SCVI(adata) +# +# model.train( +# max_epochs=1, +# check_val_every_n_epoch=1, +# accelerator="gpu", +# devices=-1, +# strategy="ddp_find_unused_parameters_true", +# ) +# +# torch.distributed.destroy_process_group() +# +# assert model.is_trained +# """ +# +# if torch.cuda.is_available(): +# # Define the file path for the temporary script in the current working directory +# temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") +# +# # Write the training code to the file in the current working directory +# with open(temp_file_path, "w") as temp_file: +# temp_file.write(training_code) +# print(f"Temporary Python file created at: {temp_file_path}") +# +# def launch_ddp(world_size, temp_file_path): +# # Command to run the script via torchrun +# command = [ +# "torchrun", +# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs +# temp_file_path, # Your original script +# ] +# # Use subprocess to run the command +# try: +# # Run the command, wait for it to finish & clean up the temporary file +# subprocess.run(command, check=True) +# except subprocess.CalledProcessError as e: +# os.remove(temp_file_path) +# print(f"Error occurred while running the DDP training: {e}") +# raise +# finally: +# os.remove(temp_file_path) +# +# launch_ddp(torch.cuda.device_count(), temp_file_path) From 58c629d90377d6ee162097e0d622ea503db3888e Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Wed, 4 Dec 2024 00:27:40 +0200 Subject: [PATCH 31/42] revert fixes --- .github/workflows/test_linux_cuda.yml | 2 +- src/scvi/dataloaders/_ann_dataloader.py | 8 ++++---- src/scvi/dataloaders/_samplers.py | 14 +++++++------- src/scvi/model/_scanvi.py | 1 - tests/model/test_jaxscvi.py | 4 ---- tests/model/test_scvi.py | 2 +- 6 files changed, 13 insertions(+), 
18 deletions(-) diff --git a/.github/workflows/test_linux_cuda.yml b/.github/workflows/test_linux_cuda.yml index d0bc164c9f..0eca4d65c8 100644 --- a/.github/workflows/test_linux_cuda.yml +++ b/.github/workflows/test_linux_cuda.yml @@ -59,7 +59,7 @@ jobs: run: | python -m pip install --upgrade pip wheel uv python -m uv pip install --system "scvi-tools[tests] @ ." - python -m pip install jax[cuda12] + python -m pip install jax[cuda] python -m pip install nvidia-nccl-cu12 - name: Run pytest diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py index 82a13ac0b6..eca803c581 100644 --- a/src/scvi/dataloaders/_ann_dataloader.py +++ b/src/scvi/dataloaders/_ann_dataloader.py @@ -117,10 +117,6 @@ def __init__( batch_size=batch_size, drop_last=drop_last, ) - # do not touch batch size here, sampler gives batched indices - # This disables PyTorch automatic batching, which is necessary - # for fast access to sparse matrices - self.kwargs.update({"batch_size": None, "shuffle": False}) else: sampler = BatchDistributedSampler( self.dataset, @@ -130,6 +126,10 @@ def __init__( shuffle=shuffle, **kwargs, ) + # do not touch batch size here, sampler gives batched indices + # This disables PyTorch automatic batching, which is necessary + # for fast access to sparse matrices + self.kwargs.update({"batch_size": None, "shuffle": False}) self.kwargs.update({"sampler": sampler}) diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index 8753c6feec..667f27d699 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -36,13 +36,13 @@ def __init__( drop_dataset_tail: bool = False, **kwargs, ): - for redundant_key in [ - "pin_memory", - "num_workers", - "persistent_workers", - ]: - if redundant_key in kwargs: - kwargs.pop(redundant_key) + # for redundant_key in [ + # "pin_memory", + # "num_workers", + # "persistent_workers", + # ]: + # if redundant_key in kwargs: + # kwargs.pop(redundant_key) 
super().__init__(dataset, drop_last=drop_dataset_tail, **kwargs) self.batch_size = batch_size diff --git a/src/scvi/model/_scanvi.py b/src/scvi/model/_scanvi.py index edd75e5324..084d83be1f 100644 --- a/src/scvi/model/_scanvi.py +++ b/src/scvi/model/_scanvi.py @@ -411,7 +411,6 @@ def train( shuffle_set_split=shuffle_set_split, n_samples_per_label=n_samples_per_label, batch_size=batch_size, - # distributed_sampler=use_distributed_sampler(trainer_kwargs.get("strategy", None)), **datasplitter_kwargs, ) training_plan = self._training_plan_cls(self.module, self.n_labels, **plan_kwargs) diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py index 1d6ee6b313..63e50136cb 100644 --- a/tests/model/test_jaxscvi.py +++ b/tests/model/test_jaxscvi.py @@ -12,8 +12,6 @@ def test_jax_scvi(n_latent=5): - # accelerator = "gpu" if torch.cuda.is_available() else "cpu" - adata = synthetic_iid() JaxSCVI.setup_anndata( adata, @@ -33,7 +31,6 @@ def test_jax_scvi(n_latent=5): def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): - # accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() JaxSCVI.setup_anndata( adata, @@ -58,7 +55,6 @@ def test_jax_scvi_training(n_latent: int = 5, dropout_rate: float = 0.1): def test_jax_scvi_save_load(save_path: str, n_latent: int = 5): - # accelerator = "gpu" if torch.cuda.is_available() else "cpu" adata = synthetic_iid() JaxSCVI.setup_anndata( adata, diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index acbee48228..af8e5e8579 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1322,7 +1322,7 @@ def test_scvi_num_workers(): # # assert model.is_trained # """ -# +# import subprocess # if torch.cuda.is_available(): # # Define the file path for the temporary script in the current working directory # temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") From ba6c9f0b2608dbc251bf731a640a94b80914945d Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: 
Wed, 4 Dec 2024 00:43:34 +0200 Subject: [PATCH 32/42] revert fixes --- src/scvi/dataloaders/_ann_dataloader.py | 2 +- tests/model/test_jaxscvi.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py index eca803c581..04bbae6b6c 100644 --- a/src/scvi/dataloaders/_ann_dataloader.py +++ b/src/scvi/dataloaders/_ann_dataloader.py @@ -136,4 +136,4 @@ def __init__( if iter_ndarray: self.kwargs.update({"collate_fn": lambda x: x}) - super().__init__(self.dataset, drop_last=drop_dataset_tail, **self.kwargs) + super().__init__(self.dataset, **self.kwargs) diff --git a/tests/model/test_jaxscvi.py b/tests/model/test_jaxscvi.py index 63e50136cb..dac25b7c89 100644 --- a/tests/model/test_jaxscvi.py +++ b/tests/model/test_jaxscvi.py @@ -2,8 +2,6 @@ import numpy as np import pytest - -# import torch from flax import linen as nn from scvi.data import synthetic_iid From dd8641d0bf6e1bf5ae42538f167b5004fd479ee7 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Wed, 4 Dec 2024 15:27:19 +0200 Subject: [PATCH 33/42] more fixes --- src/scvi/dataloaders/_ann_dataloader.py | 2 +- src/scvi/dataloaders/_concat_dataloader.py | 13 +++++++------ tests/dataloaders/test_dataloaders.py | 2 ++ tests/model/test_scanvi.py | 9 +++------ 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py index 04bbae6b6c..7860184786 100644 --- a/src/scvi/dataloaders/_ann_dataloader.py +++ b/src/scvi/dataloaders/_ann_dataloader.py @@ -124,7 +124,7 @@ def __init__( drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, shuffle=shuffle, - **kwargs, + # **kwargs, ) # do not touch batch size here, sampler gives batched indices # This disables PyTorch automatic batching, which is necessary diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py index 554f0267f0..1d041758db 100644 --- 
a/src/scvi/dataloaders/_concat_dataloader.py +++ b/src/scvi/dataloaders/_concat_dataloader.py @@ -53,11 +53,12 @@ def __init__( self._shuffle = shuffle self._batch_size = batch_size self._drop_last = drop_last - self._drop_dataset_tail = ( - self.dataloader_kwargs["drop_dataset_tail"] - if "drop_dataset_tail" in self.dataloader_kwargs.keys() - else False - ) + self._distributed_sampler = distributed_sampler + # self._drop_dataset_tail = ( + # self.dataloader_kwargs["drop_dataset_tail"] + # if "drop_dataset_tail" in self.dataloader_kwargs.keys() + # else False + # ) self.dataloaders = [] for indices in indices_list: @@ -75,7 +76,7 @@ def __init__( ) lens = [len(dl) for dl in self.dataloaders] self.largest_dl = self.dataloaders[np.argmax(lens)] - super().__init__(self.largest_dl, drop_last=self._drop_dataset_tail, **data_loader_kwargs) + super().__init__(self.largest_dl, **data_loader_kwargs) def __len__(self): return len(self.largest_dl) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 0c6bec9804..8b320b7aeb 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -164,3 +164,5 @@ def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): ) model.train(1, datasplitter_kwargs=datasplitter_kwargs) + + torch.distributed.destroy_process_group() diff --git a/tests/model/test_scanvi.py b/tests/model/test_scanvi.py index 7d8fced0d2..11349da290 100644 --- a/tests/model/test_scanvi.py +++ b/tests/model/test_scanvi.py @@ -580,7 +580,7 @@ def check_no_logits_and_softmax(model: SCANVI): check_no_logits_and_softmax(model) -# def test_scanvi_train_ddp(): +# def test_scanvi_train_ddp(save_path: str): # training_code = """ # import torch # import scvi @@ -609,13 +609,10 @@ def check_no_logits_and_softmax(model: SCANVI): # # assert model.is_trained # """ -# +# import subprocess # if torch.cuda.is_available(): -# # Get the current working directory (CWD) -# cwd = os.getcwd() -# 
# # Define the file path for the temporary script in the current working directory -# temp_file_path = os.path.join(cwd, "train_scanvi_ddp_temp.py") +# temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py") # # # Write the training code to the file in the current working directory # with open(temp_file_path, "w") as temp_file: From bbf00231bc5c03edf730b08b3d4293148a3f0257 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Wed, 4 Dec 2024 18:40:45 +0200 Subject: [PATCH 34/42] more fixes --- tests/model/test_scvi.py | 105 ++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index af8e5e8579..28e56b47d6 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1299,55 +1299,56 @@ def test_scvi_num_workers(): model.get_normalized_expression(n_samples=2) -# def test_scvi_train_ddp(save_path: str): -# training_code = """ -# import torch -# import scvi -# from scvi.model import SCVI -# -# adata = scvi.data.synthetic_iid() -# SCVI.setup_anndata(adata) -# -# model = SCVI(adata) -# -# model.train( -# max_epochs=1, -# check_val_every_n_epoch=1, -# accelerator="gpu", -# devices=-1, -# strategy="ddp_find_unused_parameters_true", -# ) -# -# torch.distributed.destroy_process_group() -# -# assert model.is_trained -# """ -# import subprocess -# if torch.cuda.is_available(): -# # Define the file path for the temporary script in the current working directory -# temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") -# -# # Write the training code to the file in the current working directory -# with open(temp_file_path, "w") as temp_file: -# temp_file.write(training_code) -# print(f"Temporary Python file created at: {temp_file_path}") -# -# def launch_ddp(world_size, temp_file_path): -# # Command to run the script via torchrun -# command = [ -# "torchrun", -# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs -# temp_file_path, 
# Your original script -# ] -# # Use subprocess to run the command -# try: -# # Run the command, wait for it to finish & clean up the temporary file -# subprocess.run(command, check=True) -# except subprocess.CalledProcessError as e: -# os.remove(temp_file_path) -# print(f"Error occurred while running the DDP training: {e}") -# raise -# finally: -# os.remove(temp_file_path) -# -# launch_ddp(torch.cuda.device_count(), temp_file_path) +def test_scvi_train_ddp(save_path: str): + training_code = """ +import torch +import scvi +from scvi.model import SCVI + +adata = scvi.data.synthetic_iid() +SCVI.setup_anndata(adata) + +model = SCVI(adata) + +model.train( + max_epochs=1, + check_val_every_n_epoch=1, + accelerator="gpu", + devices=-1, + strategy="ddp_find_unused_parameters_true", +) + +torch.distributed.destroy_process_group() + +assert model.is_trained +""" + import subprocess + + if torch.cuda.is_available(): + # Define the file path for the temporary script in the current working directory + temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") + + # Write the training code to the file in the current working directory + with open(temp_file_path, "w") as temp_file: + temp_file.write(training_code) + print(f"Temporary Python file created at: {temp_file_path}") + + def launch_ddp(world_size, temp_file_path): + # Command to run the script via torchrun + command = [ + "torchrun", + "--nproc_per_node=" + str(world_size), # Specify the number of GPUs + temp_file_path, # Your original script + ] + # Use subprocess to run the command + try: + # Run the command, wait for it to finish & clean up the temporary file + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + os.remove(temp_file_path) + print(f"Error occurred while running the DDP training: {e}") + raise + finally: + os.remove(temp_file_path) + + launch_ddp(torch.cuda.device_count(), temp_file_path) From 3dc4b334cf9c1520a2ea3d8f58d8bceb5b48e5d9 Mon Sep 17 00:00:00 2001 From: 
Ori Kronfeld Date: Wed, 4 Dec 2024 18:58:10 +0200 Subject: [PATCH 35/42] revert the test which fail on runner --- tests/model/test_scvi.py | 105 +++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py index 28e56b47d6..af8e5e8579 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1299,56 +1299,55 @@ def test_scvi_num_workers(): model.get_normalized_expression(n_samples=2) -def test_scvi_train_ddp(save_path: str): - training_code = """ -import torch -import scvi -from scvi.model import SCVI - -adata = scvi.data.synthetic_iid() -SCVI.setup_anndata(adata) - -model = SCVI(adata) - -model.train( - max_epochs=1, - check_val_every_n_epoch=1, - accelerator="gpu", - devices=-1, - strategy="ddp_find_unused_parameters_true", -) - -torch.distributed.destroy_process_group() - -assert model.is_trained -""" - import subprocess - - if torch.cuda.is_available(): - # Define the file path for the temporary script in the current working directory - temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") - - # Write the training code to the file in the current working directory - with open(temp_file_path, "w") as temp_file: - temp_file.write(training_code) - print(f"Temporary Python file created at: {temp_file_path}") - - def launch_ddp(world_size, temp_file_path): - # Command to run the script via torchrun - command = [ - "torchrun", - "--nproc_per_node=" + str(world_size), # Specify the number of GPUs - temp_file_path, # Your original script - ] - # Use subprocess to run the command - try: - # Run the command, wait for it to finish & clean up the temporary file - subprocess.run(command, check=True) - except subprocess.CalledProcessError as e: - os.remove(temp_file_path) - print(f"Error occurred while running the DDP training: {e}") - raise - finally: - os.remove(temp_file_path) - - launch_ddp(torch.cuda.device_count(), temp_file_path) +# def 
test_scvi_train_ddp(save_path: str): +# training_code = """ +# import torch +# import scvi +# from scvi.model import SCVI +# +# adata = scvi.data.synthetic_iid() +# SCVI.setup_anndata(adata) +# +# model = SCVI(adata) +# +# model.train( +# max_epochs=1, +# check_val_every_n_epoch=1, +# accelerator="gpu", +# devices=-1, +# strategy="ddp_find_unused_parameters_true", +# ) +# +# torch.distributed.destroy_process_group() +# +# assert model.is_trained +# """ +# import subprocess +# if torch.cuda.is_available(): +# # Define the file path for the temporary script in the current working directory +# temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") +# +# # Write the training code to the file in the current working directory +# with open(temp_file_path, "w") as temp_file: +# temp_file.write(training_code) +# print(f"Temporary Python file created at: {temp_file_path}") +# +# def launch_ddp(world_size, temp_file_path): +# # Command to run the script via torchrun +# command = [ +# "torchrun", +# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs +# temp_file_path, # Your original script +# ] +# # Use subprocess to run the command +# try: +# # Run the command, wait for it to finish & clean up the temporary file +# subprocess.run(command, check=True) +# except subprocess.CalledProcessError as e: +# os.remove(temp_file_path) +# print(f"Error occurred while running the DDP training: {e}") +# raise +# finally: +# os.remove(temp_file_path) +# +# launch_ddp(torch.cuda.device_count(), temp_file_path) From 0a35c19e4d7b9bdfae9767d7939452ef1e776f87 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Sun, 8 Dec 2024 15:01:26 +0200 Subject: [PATCH 36/42] moved ddp test to a new file --- tests/dataloaders/test_dataloaders.py | 2 +- tests/model/test_multigpu.py | 111 ++++++++++++++++++++++++++ tests/model/test_scanvi.py | 60 -------------- tests/model/test_scvi.py | 54 ------------- 4 files changed, 112 insertions(+), 115 deletions(-) create mode 100644 
tests/model/test_multigpu.py diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index 8b320b7aeb..c9fd30f29c 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -103,7 +103,7 @@ def multiprocessing_worker( ): # initializes the distributed backend that takes care of synchronizing processes torch.distributed.init_process_group( - "nccl", # backend that works on all systems + "nccl", init_method=f"file://{save_path}/dist_file", rank=rank, world_size=world_size, diff --git a/tests/model/test_multigpu.py b/tests/model/test_multigpu.py new file mode 100644 index 0000000000..929caa9dc8 --- /dev/null +++ b/tests/model/test_multigpu.py @@ -0,0 +1,111 @@ +# def test_scvi_train_ddp(save_path: str): +# training_code = """ +# import torch +# import scvi +# from scvi.model import SCVI +# +# adata = scvi.data.synthetic_iid() +# SCVI.setup_anndata(adata) +# +# model = SCVI(adata) +# +# model.train( +# max_epochs=1, +# check_val_every_n_epoch=1, +# accelerator="gpu", +# devices=-1, +# strategy="ddp_find_unused_parameters_true", +# ) +# +# torch.distributed.destroy_process_group() +# +# assert model.is_trained +# """ +# import subprocess +# if torch.cuda.is_available(): +# # Define the file path for the temporary script in the current working directory +# temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") +# +# # Write the training code to the file in the current working directory +# with open(temp_file_path, "w") as temp_file: +# temp_file.write(training_code) +# print(f"Temporary Python file created at: {temp_file_path}") +# +# def launch_ddp(world_size, temp_file_path): +# # Command to run the script via torchrun +# command = [ +# "torchrun", +# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs +# temp_file_path, # Your original script +# ] +# # Use subprocess to run the command +# try: +# # Run the command, wait for it to finish & clean up the temporary file +# 
subprocess.run(command, check=True) +# except subprocess.CalledProcessError as e: +# os.remove(temp_file_path) +# print(f"Error occurred while running the DDP training: {e}") +# raise +# finally: +# os.remove(temp_file_path) +# +# launch_ddp(torch.cuda.device_count(), temp_file_path) +# +# def test_scanvi_train_ddp(save_path: str): +# training_code = """ +# import torch +# import scvi +# from scvi.model import SCANVI +# +# adata = scvi.data.synthetic_iid() +# SCANVI.setup_anndata( +# adata, +# "labels", +# "label_0", +# batch_key="batch", +# ) +# +# model = SCANVI(adata, n_latent=10) +# +# model.train( +# max_epochs=100, +# train_size=0.5, +# check_val_every_n_epoch=1, +# accelerator="gpu", +# devices=-1, +# strategy="ddp_find_unused_parameters_true", +# ) +# +# torch.distributed.destroy_process_group() +# +# assert model.is_trained +# """ +# import subprocess +# if torch.cuda.is_available(): +# # Define the file path for the temporary script in the current working directory +# temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py") +# +# # Write the training code to the file in the current working directory +# with open(temp_file_path, "w") as temp_file: +# temp_file.write(training_code) +# print(f"Temporary Python file created at: {temp_file_path}") +# +# def launch_ddp(world_size, temp_file_path): +# # Command to run the script via torchrun +# command = [ +# "torchrun", +# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs +# temp_file_path, # Your original script +# ] +# # Use subprocess to run the command +# try: +# # Run the command, wait for it to finish & clean up the temporary file +# subprocess.run(command, check=True) +# except subprocess.CalledProcessError as e: +# os.remove(temp_file_path) +# print(f"Error occurred while running the DDP training: {e}") +# raise +# finally: +# os.remove(temp_file_path) +# +# launch_ddp(torch.cuda.device_count(), temp_file_path) diff --git a/tests/model/test_scanvi.py 
b/tests/model/test_scanvi.py index 11349da290..aede994029 100644 --- a/tests/model/test_scanvi.py +++ b/tests/model/test_scanvi.py @@ -578,63 +578,3 @@ def check_no_logits_and_softmax(model: SCANVI): model = SCANVI.load(resave_model_path, adata) check_no_logits_and_softmax(model) - - -# def test_scanvi_train_ddp(save_path: str): -# training_code = """ -# import torch -# import scvi -# from scvi.model import SCANVI -# -# adata = scvi.data.synthetic_iid() -# SCANVI.setup_anndata( -# adata, -# "labels", -# "label_0", -# batch_key="batch", -# ) -# -# model = SCANVI(adata, n_latent=10) -# -# model.train( -# max_epochs=100, -# train_size=0.5, -# check_val_every_n_epoch=1, -# accelerator="gpu", -# devices=-1, -# strategy="ddp_find_unused_parameters_true", -# ) -# -# torch.distributed.destroy_process_group() -# -# assert model.is_trained -# """ -# import subprocess -# if torch.cuda.is_available(): -# # Define the file path for the temporary script in the current working directory -# temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py") -# -# # Write the training code to the file in the current working directory -# with open(temp_file_path, "w") as temp_file: -# temp_file.write(training_code) -# print(f"Temporary Python file created at: {temp_file_path}") -# -# def launch_ddp(world_size, temp_file_path): -# # Command to run the script via torchrun -# command = [ -# "torchrun", -# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs -# temp_file_path, # Your original script -# ] -# # Use subprocess to run the command -# try: -# # Run the command, wait for it to finish & clean up the temporary file -# subprocess.run(command, check=True) -# except subprocess.CalledProcessError as e: -# os.remove(temp_file_path) -# print(f"Error occurred while running the DDP training: {e}") -# raise -# finally: -# os.remove(temp_file_path) -# -# launch_ddp(torch.cuda.device_count(), temp_file_path) diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py 
index af8e5e8579..49fe18e531 100644 --- a/tests/model/test_scvi.py +++ b/tests/model/test_scvi.py @@ -1297,57 +1297,3 @@ def test_scvi_num_workers(): model.get_reconstruction_error() model.get_normalized_expression(transform_batch="batch_1") model.get_normalized_expression(n_samples=2) - - -# def test_scvi_train_ddp(save_path: str): -# training_code = """ -# import torch -# import scvi -# from scvi.model import SCVI -# -# adata = scvi.data.synthetic_iid() -# SCVI.setup_anndata(adata) -# -# model = SCVI(adata) -# -# model.train( -# max_epochs=1, -# check_val_every_n_epoch=1, -# accelerator="gpu", -# devices=-1, -# strategy="ddp_find_unused_parameters_true", -# ) -# -# torch.distributed.destroy_process_group() -# -# assert model.is_trained -# """ -# import subprocess -# if torch.cuda.is_available(): -# # Define the file path for the temporary script in the current working directory -# temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") -# -# # Write the training code to the file in the current working directory -# with open(temp_file_path, "w") as temp_file: -# temp_file.write(training_code) -# print(f"Temporary Python file created at: {temp_file_path}") -# -# def launch_ddp(world_size, temp_file_path): -# # Command to run the script via torchrun -# command = [ -# "torchrun", -# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs -# temp_file_path, # Your original script -# ] -# # Use subprocess to run the command -# try: -# # Run the command, wait for it to finish & clean up the temporary file -# subprocess.run(command, check=True) -# except subprocess.CalledProcessError as e: -# os.remove(temp_file_path) -# print(f"Error occurred while running the DDP training: {e}") -# raise -# finally: -# os.remove(temp_file_path) -# -# launch_ddp(torch.cuda.device_count(), temp_file_path) From f52bdd50a4244266522d7bbb07be1cc7b6a9d192 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 9 Dec 2024 12:13:20 +0200 Subject: [PATCH 37/42] Added a 
multi GPU test flag --- .github/workflows/test_linux_multigpu.yml | 80 ++++++++ src/scvi/dataloaders/_ann_dataloader.py | 1 - src/scvi/dataloaders/_concat_dataloader.py | 5 - src/scvi/dataloaders/_samplers.py | 8 - tests/conftest.py | 23 ++- tests/dataloaders/test_dataloaders.py | 88 ++++---- tests/model/test_multigpu.py | 228 +++++++++++---------- 7 files changed, 261 insertions(+), 172 deletions(-) create mode 100644 .github/workflows/test_linux_multigpu.yml diff --git a/.github/workflows/test_linux_multigpu.yml b/.github/workflows/test_linux_multigpu.yml new file mode 100644 index 0000000000..103b49180f --- /dev/null +++ b/.github/workflows/test_linux_multigpu.yml @@ -0,0 +1,80 @@ +name: test (multi-GPU) + +on: + pull_request: + branches: [main, "[0-9]+.[0-9]+.x"] + types: [labeled, synchronize, opened] + schedule: + - cron: "0 10 * * *" # runs at 10:00 UTC (03:00 PST) every day + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + # if PR has label "multiGPU tests" or "all tests" or if scheduled or manually triggered or on push + if: >- + ( + contains(github.event.pull_request.labels.*.name, 'multiGPU tests') || + contains(github.event.pull_request.labels.*.name, 'all tests') || + contains(github.event_name, 'schedule') || + contains(github.event_name, 'workflow_dispatch') || + contains(github.event_name, 'push') + ) + + runs-on: [self-hosted, Linux, X64, CUDA] + + defaults: + run: + shell: bash -e {0} # -e to fail on error + + container: + image: ghcr.io/scverse/scvi-tools:py3.12-cu12-base + options: --user root --gpus all --pull always + + # strategy: + # fail-fast: false + # matrix: + # os: [ubuntu-latest] + # python: ["3.12"] + + permissions: + id-token: write + + name: unit + + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python }} + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + cache: "pip" + 
cache-dependency-path: "**/pyproject.toml" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip wheel uv + python -m uv pip install --system "scvi-tools[tests] @ ." + python -m pip install jax[cuda] + python -m pip install nvidia-nccl-cu12 + + - name: Run pytest + env: + MPLBACKEND: agg + PLATFORM: ${{ matrix.os }} + DISPLAY: :42 + COLUMNS: 120 + run: | + coverage run -m pytest -v --color=yes --multigpu-tests --accelerator cuda --devices auto + coverage report + + - uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/src/scvi/dataloaders/_ann_dataloader.py b/src/scvi/dataloaders/_ann_dataloader.py index 7860184786..27e17302d5 100644 --- a/src/scvi/dataloaders/_ann_dataloader.py +++ b/src/scvi/dataloaders/_ann_dataloader.py @@ -124,7 +124,6 @@ def __init__( drop_last=drop_last, drop_dataset_tail=drop_dataset_tail, shuffle=shuffle, - # **kwargs, ) # do not touch batch size here, sampler gives batched indices # This disables PyTorch automatic batching, which is necessary diff --git a/src/scvi/dataloaders/_concat_dataloader.py b/src/scvi/dataloaders/_concat_dataloader.py index 1d041758db..fdcdea4aa8 100644 --- a/src/scvi/dataloaders/_concat_dataloader.py +++ b/src/scvi/dataloaders/_concat_dataloader.py @@ -54,11 +54,6 @@ def __init__( self._batch_size = batch_size self._drop_last = drop_last self._distributed_sampler = distributed_sampler - # self._drop_dataset_tail = ( - # self.dataloader_kwargs["drop_dataset_tail"] - # if "drop_dataset_tail" in self.dataloader_kwargs.keys() - # else False - # ) self.dataloaders = [] for indices in indices_list: diff --git a/src/scvi/dataloaders/_samplers.py b/src/scvi/dataloaders/_samplers.py index 667f27d699..283b2586e2 100644 --- a/src/scvi/dataloaders/_samplers.py +++ b/src/scvi/dataloaders/_samplers.py @@ -36,14 +36,6 @@ def __init__( drop_dataset_tail: bool = False, **kwargs, ): - # for redundant_key in [ - # "pin_memory", - # "num_workers", - # 
"persistent_workers", - # ]: - # if redundant_key in kwargs: - # kwargs.pop(redundant_key) - super().__init__(dataset, drop_last=drop_dataset_tail, **kwargs) self.batch_size = batch_size self.drop_last_batch = drop_last # drop_last already defined in parent diff --git a/tests/conftest.py b/tests/conftest.py index aac511cefc..3a6e942c7a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,6 @@ import shutil import pytest -from distutils.dir_util import copy_tree import scvi from tests.data.utils import generic_setup_adata_manager @@ -15,6 +14,12 @@ def pytest_addoption(parser): default=False, help="Run tests that retrieve stuff from the internet. This increases test time.", ) + parser.addoption( + "--multigpu-tests", + action="store_true", + default=False, + help="Run tests that are desinged for multiGPU.", + ) parser.addoption( "--optional", action="store_true", @@ -62,7 +67,7 @@ def pytest_collection_modifyitems(config, items): # `--internet-tests` passed if not run_internet and ("internet" in item.keywords): item.add_marker(skip_internet) - # Skip all tests not marked with `pytest.mark.internet` if `--internet` passed + # Skip all tests not marked with `pytest.mark.internet` if `--internet-tests` passed elif run_internet and ("internet" not in item.keywords): item.add_marker(skip_non_internet) @@ -90,13 +95,25 @@ def pytest_collection_modifyitems(config, items): elif run_private and ("private" not in item.keywords): item.add_marker(skip_non_private) + run_multigpu = config.getoption("--multigpu-tests") + skip_multigpu = pytest.mark.skip(reason="need --multigpu-tests option to run") + skip_non_multigpu = pytest.mark.skip(reason="test not having a pytest.mark.multigpu decorator") + for item in items: + # All tests marked with `pytest.mark.multigpu` get skipped unless + # `--multigpu-tests` passed + if not run_multigpu and ("multigpu" in item.keywords): + item.add_marker(skip_multigpu) + # Skip all tests not marked with `pytest.mark.multigpu` if 
`--multigpu-tests` passed + elif run_multigpu and ("multigpu" not in item.keywords): + item.add_marker(skip_non_multigpu) + @pytest.fixture(scope="session") def save_path(tmp_path_factory): """Docstring for save_path.""" dir = tmp_path_factory.mktemp("temp_data", numbered=False) path = str(dir) - copy_tree("tests/test_data", path) + shutil.copy_tree("tests/test_data", path) yield path + "/" shutil.rmtree(str(tmp_path_factory.getbasetemp())) diff --git a/tests/dataloaders/test_dataloaders.py b/tests/dataloaders/test_dataloaders.py index c9fd30f29c..c1e96c6786 100644 --- a/tests/dataloaders/test_dataloaders.py +++ b/tests/dataloaders/test_dataloaders.py @@ -115,54 +115,54 @@ def multiprocessing_worker( return +@pytest.mark.multigpu @pytest.mark.parametrize("num_processes", [1, 2]) def test_anndataloader_distributed_sampler(num_processes: int, save_path: str): - if torch.cuda.is_available(): - adata = scvi.data.synthetic_iid() - manager = generic_setup_adata_manager(adata) - - file_path = save_path + "/dist_file" - if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) - - torch.multiprocessing.spawn( - multiprocessing_worker, - args=(num_processes, manager, save_path, {}), - nprocs=num_processes, - join=True, - ) + adata = scvi.data.synthetic_iid() + manager = generic_setup_adata_manager(adata) + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + + torch.multiprocessing.spawn( + multiprocessing_worker, + args=(num_processes, manager, save_path, {}), + nprocs=num_processes, + join=True, + ) -@pytest.mark.parametrize("num_processes", [1]) + +@pytest.mark.multigpu +@pytest.mark.parametrize("num_processes", [1, 2]) def test_scanvi_with_distributed_sampler(num_processes: int, save_path: str): - if torch.cuda.is_available(): - adata = scvi.data.synthetic_iid() - SCANVI.setup_anndata( - adata, - "labels", - "label_0", - batch_key="batch", - ) - file_path = save_path + "/dist_file" 
- if os.path.exists(file_path): # Check if the file exists - os.remove(file_path) - datasplitter_kwargs = {} - # Multi-GPU settings - datasplitter_kwargs["distributed_sampler"] = True - datasplitter_kwargs["drop_last"] = False - if num_processes == 1: - datasplitter_kwargs["distributed_sampler"] = False - model = SCANVI(adata, n_latent=10) - - # initializes the distributed backend that takes care of synchronizing processes - torch.distributed.init_process_group( - "nccl", # backend that works on all systems - init_method=f"file://{save_path}/dist_file", - rank=0, - world_size=num_processes, - store=None, - ) + adata = scvi.data.synthetic_iid() + SCANVI.setup_anndata( + adata, + "labels", + "label_0", + batch_key="batch", + ) + file_path = save_path + "/dist_file" + if os.path.exists(file_path): # Check if the file exists + os.remove(file_path) + datasplitter_kwargs = {} + # Multi-GPU settings + datasplitter_kwargs["distributed_sampler"] = True + datasplitter_kwargs["drop_last"] = False + if num_processes == 1: + datasplitter_kwargs["distributed_sampler"] = False + model = SCANVI(adata, n_latent=10) + + # initializes the distributed backend that takes care of synchronizing processes + torch.distributed.init_process_group( + "nccl", # backend that works on all systems + init_method=f"file://{save_path}/dist_file", + rank=0, + world_size=num_processes, + store=None, + ) - model.train(1, datasplitter_kwargs=datasplitter_kwargs) + model.train(1, datasplitter_kwargs=datasplitter_kwargs) - torch.distributed.destroy_process_group() + torch.distributed.destroy_process_group() diff --git a/tests/model/test_multigpu.py b/tests/model/test_multigpu.py index 929caa9dc8..301502a27e 100644 --- a/tests/model/test_multigpu.py +++ b/tests/model/test_multigpu.py @@ -1,111 +1,117 @@ -# def test_scvi_train_ddp(save_path: str): -# training_code = """ -# import torch -# import scvi -# from scvi.model import SCVI -# -# adata = scvi.data.synthetic_iid() -# SCVI.setup_anndata(adata) -# -# 
model = SCVI(adata) -# -# model.train( -# max_epochs=1, -# check_val_every_n_epoch=1, -# accelerator="gpu", -# devices=-1, -# strategy="ddp_find_unused_parameters_true", -# ) -# -# torch.distributed.destroy_process_group() -# -# assert model.is_trained -# """ -# import subprocess -# if torch.cuda.is_available(): -# # Define the file path for the temporary script in the current working directory -# temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py") -# -# # Write the training code to the file in the current working directory -# with open(temp_file_path, "w") as temp_file: -# temp_file.write(training_code) -# print(f"Temporary Python file created at: {temp_file_path}") -# -# def launch_ddp(world_size, temp_file_path): -# # Command to run the script via torchrun -# command = [ -# "torchrun", -# "--nproc_per_node=" + str(world_size), # Specify the number of GPUs -# temp_file_path, # Your original script -# ] -# # Use subprocess to run the command -# try: -# # Run the command, wait for it to finish & clean up the temporary file -# subprocess.run(command, check=True) -# except subprocess.CalledProcessError as e: -# os.remove(temp_file_path) -# print(f"Error occurred while running the DDP training: {e}") -# raise -# finally: -# os.remove(temp_file_path) -# -# launch_ddp(torch.cuda.device_count(), temp_file_path) -# -# def test_scanvi_train_ddp(save_path: str): -# training_code = """ -# import torch -# import scvi -# from scvi.model import SCANVI -# -# adata = scvi.data.synthetic_iid() -# SCANVI.setup_anndata( -# adata, -# "labels", -# "label_0", -# batch_key="batch", -# ) -# -# model = SCANVI(adata, n_latent=10) -# -# model.train( -# max_epochs=100, -# train_size=0.5, -# check_val_every_n_epoch=1, -# accelerator="gpu", -# devices=-1, -# strategy="ddp_find_unused_parameters_true", -# ) -# -# torch.distributed.destroy_process_group() -# -# assert model.is_trained -# """ -# import subprocess -# if torch.cuda.is_available(): -# # Define the file path for the 
temporary script in the current working directory
-#     temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py")
-#
-#     # Write the training code to the file in the current working directory
-#     with open(temp_file_path, "w") as temp_file:
-#         temp_file.write(training_code)
-#         print(f"Temporary Python file created at: {temp_file_path}")
-#
-#     def launch_ddp(world_size, temp_file_path):
-#         # Command to run the script via torchrun
-#         command = [
-#             "torchrun",
-#             "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
-#             temp_file_path,  # Your original script
-#         ]
-#         # Use subprocess to run the command
-#         try:
-#             # Run the command, wait for it to finish & clean up the temporary file
-#             subprocess.run(command, check=True)
-#         except subprocess.CalledProcessError as e:
-#             os.remove(temp_file_path)
-#             print(f"Error occurred while running the DDP training: {e}")
-#             raise
-#         finally:
-#             os.remove(temp_file_path)
-#
-#     launch_ddp(torch.cuda.device_count(), temp_file_path)
+import os
+import subprocess
+
+import pytest
+import torch
+
+
+@pytest.mark.multigpu
+def test_scvi_train_ddp(save_path: str):
+    training_code = """
+import torch
+import scvi
+from scvi.model import SCVI
+
+adata = scvi.data.synthetic_iid()
+SCVI.setup_anndata(adata)
+
+model = SCVI(adata)
+
+model.train(
+    max_epochs=1,
+    check_val_every_n_epoch=1,
+    accelerator="gpu",
+    devices=-1,
+    strategy="ddp_find_unused_parameters_true",
+)
+
+torch.distributed.destroy_process_group()
+
+assert model.is_trained
+"""
+    # Path of the temporary training script, created inside the ``save_path`` directory
+    temp_file_path = os.path.join(save_path, "train_scvi_ddp_temp.py")
+
+    # Write the training code to that file so torchrun can execute it as a standalone script
+    with open(temp_file_path, "w") as temp_file:
+        temp_file.write(training_code)
+        print(f"Temporary Python file created at: {temp_file_path}")
+
+    def launch_ddp(world_size, temp_file_path):
+        # torchrun command line: one worker process per requested GPU
+        command = [
+            "torchrun",
+            "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+            temp_file_path,  # Script executed by every worker
+        ]
+        # Use subprocess to run the command
+        try:
+            # check=True raises CalledProcessError when torchrun exits with a non-zero status
+            subprocess.run(command, check=True)
+        except subprocess.CalledProcessError as e:
+            # No removal here: ``finally`` deletes the script (a 2nd remove would raise)
+            print(f"Error occurred while running the DDP training: {e}")
+            raise
+        finally:
+            os.remove(temp_file_path)
+
+    launch_ddp(torch.cuda.device_count(), temp_file_path)
+
+
+@pytest.mark.multigpu
+def test_scanvi_train_ddp(save_path: str):
+    training_code = """
+import torch
+import scvi
+from scvi.model import SCANVI
+
+adata = scvi.data.synthetic_iid()
+SCANVI.setup_anndata(
+    adata,
+    "labels",
+    "label_0",
+    batch_key="batch",
+)
+
+model = SCANVI(adata, n_latent=10)
+
+model.train(
+    max_epochs=100,
+    train_size=0.5,
+    check_val_every_n_epoch=1,
+    accelerator="gpu",
+    devices=-1,
+    strategy="ddp_find_unused_parameters_true",
+)
+
+torch.distributed.destroy_process_group()
+
+assert model.is_trained
+"""
+    # Path of the temporary training script, created inside the ``save_path`` directory
+    temp_file_path = os.path.join(save_path, "train_scanvi_ddp_temp.py")
+
+    # Write the training code to that file so torchrun can execute it as a standalone script
+    with open(temp_file_path, "w") as temp_file:
+        temp_file.write(training_code)
+        print(f"Temporary Python file created at: {temp_file_path}")
+
+    def launch_ddp(world_size, temp_file_path):
+        # torchrun command line: one worker process per requested GPU
+        command = [
+            "torchrun",
+            "--nproc_per_node=" + str(world_size),  # Specify the number of GPUs
+            temp_file_path,  # Script executed by every worker
+        ]
+        # Use subprocess to run the command
+        try:
+            # check=True raises CalledProcessError when torchrun exits with a non-zero status
+            subprocess.run(command, check=True)
+        except subprocess.CalledProcessError as e:
+            # No removal here: ``finally`` deletes the script (a 2nd remove would raise)
+            print(f"Error occurred while running the DDP training: {e}")
+            raise
+        finally:
+            os.remove(temp_file_path)
+
+    launch_ddp(torch.cuda.device_count(), temp_file_path)

From 2641284d2345ae9c47e5e2a1563a47c6f22b0ec7 Mon Sep 17 00:00:00 2001
From: Ori Kronfeld
Date: Mon, 9 Dec 2024 12:22:34 +0200
Subject: [PATCH 38/42] Added changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index de569f0b19..66df2a0a58 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,8 +27,12 @@ to [Semantic Versioning]. Full commit history is available in the
 
 #### Fixed
 
+- Fixed bug in distributed `scvi.dataloaders._concat_dataloader` {pr}`3053`.
+
 #### Changed
 
+- Updated the CI workflow with internet, private, optional and multiGPU tests {pr}`3082`.
+
 #### Removed
 
 ### 1.2.1 (2024-12-04)

From 667895080120c33abf120a9c4a8a024a222a83af Mon Sep 17 00:00:00 2001
From: Ori Kronfeld
Date: Mon, 9 Dec 2024 12:28:54 +0200
Subject: [PATCH 39/42] added changelog

---
 CHANGELOG.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 66df2a0a58..f8a74e4dcf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,14 +10,18 @@ to [Semantic Versioning]. Full commit history is available in the
 
 #### Added
 
+- Add {class}`scvi.external.Decipher` for dimensionality reduction and interpretable
+    representation learning in single-cell RNA sequencing data {pr}`3015`.
+
 #### Fixed
 
+- Fixed bug in distributed `scvi.dataloaders._concat_dataloader` {pr}`3053`.
+
 #### Changed
 
-#### Removed
+- Updated the CI workflow with multiGPU tests {pr}`3053`.
 
-- Add {class}`scvi.external.Decipher` for dimensionality reduction and interpretable
-    representation learning in single-cell RNA sequencing data {pr}`3015`.
+#### Removed
 
 ### 1.2.2 (2024-XX-XX)
 
@@ -27,11 +31,9 @@ to [Semantic Versioning]. Full commit history is available in the
 
 #### Fixed
 
-- Fixed bug in distributed `scvi.dataloaders._concat_dataloader` {pr}`3053`.
- #### Changed -- Updated the CI workflow with internet, private, optional and multiGPU tests {pr}`3082`. +- Updated the CI workflow with internet, private and optional tests {pr}`3082`. #### Removed From 12f4767edb826dfd6dd8ff9e31c5b7f88728250e Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 9 Dec 2024 12:36:25 +0200 Subject: [PATCH 40/42] fix in multigpu tests --- .github/workflows/test_linux_multigpu.yml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_linux_multigpu.yml b/.github/workflows/test_linux_multigpu.yml index 103b49180f..1b3fa109aa 100644 --- a/.github/workflows/test_linux_multigpu.yml +++ b/.github/workflows/test_linux_multigpu.yml @@ -1,6 +1,8 @@ name: test (multi-GPU) on: + push: + branches: [main, "[0-9]+.[0-9]+.x"] #this is new pull_request: branches: [main, "[0-9]+.[0-9]+.x"] types: [labeled, synchronize, opened] @@ -20,8 +22,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'multiGPU tests') || contains(github.event.pull_request.labels.*.name, 'all tests') || contains(github.event_name, 'schedule') || - contains(github.event_name, 'workflow_dispatch') || - contains(github.event_name, 'push') + contains(github.event_name, 'workflow_dispatch') ) runs-on: [self-hosted, Linux, X64, CUDA] @@ -34,16 +35,7 @@ jobs: image: ghcr.io/scverse/scvi-tools:py3.12-cu12-base options: --user root --gpus all --pull always - # strategy: - # fail-fast: false - # matrix: - # os: [ubuntu-latest] - # python: ["3.12"] - - permissions: - id-token: write - - name: unit + name: integration env: OS: ${{ matrix.os }} From 8ade2529607348c37c329badac8ba710f41f3d62 Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 9 Dec 2024 12:46:20 +0200 Subject: [PATCH 41/42] fix in conftest --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3a6e942c7a..329c88439c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -113,7 
+113,7 @@ def save_path(tmp_path_factory): """Docstring for save_path.""" dir = tmp_path_factory.mktemp("temp_data", numbered=False) path = str(dir) - shutil.copy_tree("tests/test_data", path) + shutil.copytree("tests/test_data", path) yield path + "/" shutil.rmtree(str(tmp_path_factory.getbasetemp())) From 2f780bb70dcd1348c2eea5cf824241410e8c316a Mon Sep 17 00:00:00 2001 From: Ori Kronfeld Date: Mon, 9 Dec 2024 12:49:56 +0200 Subject: [PATCH 42/42] revert thing with shutil --- tests/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 329c88439c..6ef9467efc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ import shutil import pytest +from distutils.dir_util import copy_tree import scvi from tests.data.utils import generic_setup_adata_manager @@ -113,7 +114,7 @@ def save_path(tmp_path_factory): """Docstring for save_path.""" dir = tmp_path_factory.mktemp("temp_data", numbered=False) path = str(dir) - shutil.copytree("tests/test_data", path) + copy_tree("tests/test_data", path) yield path + "/" shutil.rmtree(str(tmp_path_factory.getbasetemp()))