From c6f972573a381e9606052492d261c41675445b25 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Tue, 15 Oct 2024 15:41:11 +0200 Subject: [PATCH 1/6] Introduce a partition_chunking argument into the MSv2 open_datatree method --- tests/test_backend.py | 11 +++++++---- xarray_ms/backend/msv2/entrypoint.py | 17 ++++++++++++++--- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index f472f75..031d367 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -156,7 +156,7 @@ def test_open_datatree(simmed_ms): # Works with default dask scheduler with ExitStack() as stack: - dt = open_datatree(simmed_ms, chunks=chunks) + dt = open_datatree(simmed_ms, partition_chunks=chunks) for ds in dt.values(): del ds.attrs["creation_date"] xt.assert_identical(dt, mem_dt) @@ -165,7 +165,7 @@ def test_open_datatree(simmed_ms): with ExitStack() as stack: cluster = stack.enter_context(LocalCluster(processes=True, n_workers=4)) stack.enter_context(Client(cluster)) - dt = open_datatree(simmed_ms, chunks=chunks) + dt = open_datatree(simmed_ms, partition_chunks=chunks) for ds in dt.values(): del ds.attrs["creation_date"] xt.assert_identical(dt, mem_dt) @@ -186,7 +186,7 @@ def test_open_datatree_chunking(simmed_ms): and partition-specific chunking""" dt = open_datatree( simmed_ms, - chunks={"time": 3, "frequency": 2}, + partition_chunks={"time": 3, "frequency": 2}, ) for child in dt.children: @@ -210,7 +210,10 @@ def test_open_datatree_chunking(simmed_ms): dt = open_datatree( simmed_ms, - chunks={"D=0": {"time": 2, "baseline": 2}, "D=1": {"time": 3, "frequency": 2}}, + partition_chunks={ + "D=0": {"time": 2, "baseline": 2}, + "D=1": {"time": 3, "frequency": 2}, + }, ) for child in dt.children: diff --git a/xarray_ms/backend/msv2/entrypoint.py b/xarray_ms/backend/msv2/entrypoint.py index 35f8ee9..9764f92 100644 --- a/xarray_ms/backend/msv2/entrypoint.py +++ b/xarray_ms/backend/msv2/entrypoint.py @@ -298,7 +298,7 @@ def open_datatree( self, filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, - chunks: Dict[str, Any] | None = None, + partition_chunks: Dict[str, Any] | None = None, drop_variables: str | Iterable[str] | None = None, partition_columns: List[str] | None = None, auto_corrs: bool = True, @@ -311,7 +311,7 @@ def open_datatree( Args: filename_or_obj: The path to the MSv2 CASA Measurement Set file. - chunks: Chunk sizes along each dimension, + partition_chunks: Chunk sizes along each dimension, e.g. :code:`{{"time": 10, "frequency": 16}}`. Individual partitions can be chunked differently by partially (or fully) specifying a partition key: e.g. @@ -331,6 +331,11 @@ def open_datatree( "D=0,F=1": {{"time": 20, "frequency": 32}}, }} + .. note:: This argument overrides the reserved ``chunks`` argument + used by xarray to control chunking in Datasets and DataTrees. + It should be used instead of ``chunks`` when different + chunking is desired for different partitions. + drop_variables: Variables to drop from the dataset. partition_columns: The columns to use for partitioning the Measurement set. Defaults to :code:`{DEFAULT_PARTITION_COLUMNS}`. @@ -355,7 +360,13 @@ def open_datatree( structure = structure_factory() datasets = {} - pchunks = promote_chunks(structure, chunks) + + if not partition_chunks: + partition_chunks = kwargs.pop("chunks", None) + elif "chunks" in kwargs: + warnings.warn("`partition_chunks` overriding `chunks`") + + pchunks = promote_chunks(structure, partition_chunks) for partition_key in structure: ds = xarray.open_dataset( From 9a171ea43618c6b60db163c8a323db44bda60cc2 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Tue, 15 Oct 2024 15:47:57 +0200 Subject: [PATCH 2/6] Update documentation --- doc/source/tutorial.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index b134ff5..aabed49 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -102,18 +102,19 @@ Per-partition chunking ++++++++++++++++++++++ Different chunking may be desired, especially when applied to -different channelisation and polarisation configurations - +different channelisation and polarisation configurations. +In these cases, the ``partition_chunks`` argument can be used +to specify different chunking setups for each partition. .. ipython:: python dt = open_datatree(ms, partition_columns=[ "DATA_DESC_ID", "FIELD_ID", "OBSERVATION_ID"], - chunks={ + partition_chunks={ (("DATA_DESC_ID", 0),): {"time": 2, "frequency": 4}, (("DATA_DESC_ID", 1),): {"time": 3, "frequency": 2}}) -See the ``chunks`` argument of +See the ``partition_chunks`` argument of :meth:`xarray_ms.backend.msv2.entrypoint.MSv2PartitionEntryPoint.open_datatree` for more information. @@ -138,7 +139,7 @@ this to a zarr_ store. dt = open_datatree(ms, partition_columns=[ "DATA_DESC_ID", "FIELD_ID", "OBSERVATION_ID"], - chunks={ + partition_chunks={ (("DATA_DESC_ID", 0),): {"time": 2, "frequency": 4}, (("DATA_DESC_ID", 1),): {"time": 3, "frequency": 2}}) From 5ba100fda5fa9b9785e5f23aaba54fa9cacba087 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Tue, 15 Oct 2024 15:50:30 +0200 Subject: [PATCH 3/6] Update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index ddc8419..e5bd8b0 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,7 @@ to be developed on well-understood MSv2 data. >>> import xarray_ms >>> from xarray.backends.api import datatree >>> dt = open_datatree("/data/L795830_SB001_uv.MS/", - chunks={"time": 2000, "baseline": 1000}) + partition_chunks={"time": 2000, "baseline": 1000}) >>> dt Group: / @@ -47,7 +47,7 @@ to be developed on well-understood MSv2 data. │ VISIBILITY (time, baseline, frequency, polarization) complex64 41GB ... │ WEIGHT (time, baseline, frequency, polarization) float32 20GB ... │ Attributes: - │ version: 0.0.1 + │ version: 4.0.0 │ creation_date: 2024-09-18T10:49:55.133908+00:00 │ data_description_id: 0 └── Group: /DATA_DESC_ID=0,FIELD_ID=0,OBSERVATION_ID=0/ANTENNA From 6dc3e62de39dc3cfaad44d109290e5120d431f09 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Tue, 15 Oct 2024 16:13:48 +0200 Subject: [PATCH 4/6] Ensure chunks is not passed in kwargs when partition_chunks is available --- xarray_ms/backend/msv2/entrypoint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray_ms/backend/msv2/entrypoint.py b/xarray_ms/backend/msv2/entrypoint.py index 9764f92..becb5e5 100644 --- a/xarray_ms/backend/msv2/entrypoint.py +++ b/xarray_ms/backend/msv2/entrypoint.py @@ -364,6 +364,7 @@ def open_datatree( if not partition_chunks: partition_chunks = kwargs.pop("chunks", None) elif "chunks" in kwargs: + kwargs.pop("chunks", None) warnings.warn("`partition_chunks` overriding `chunks`") pchunks = promote_chunks(structure, partition_chunks) From 5330dad838a4e3cc75b4875de5f49df155a73720 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Tue, 15 Oct 2024 16:18:46 +0200 Subject: [PATCH 5/6] Add a test case --- tests/test_backend.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_backend.py b/tests/test_backend.py index 031d367..a08d1d4 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -234,3 +234,13 @@ def test_open_datatree_chunking(simmed_ms): "polarization": (2,), "uvw_label": (3,), } + + with pytest.warns(UserWarning, match="`partition_chunks` overriding `chunks`"): + dt = open_datatree( + simmed_ms, + chunks={}, + partition_chunks={ + "D=0": {"time": 2, "baseline": 2}, + "D=1": {"time": 3, "frequency": 2}, + }, + ) From 1145e9d668352cfb70e21c03b74b4e89a3a71112 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Tue, 15 Oct 2024 16:20:04 +0200 Subject: [PATCH 6/6] [skip ci] Update changelog --- doc/source/changelog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index a038290..5180e76 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -5,6 +5,8 @@ Changelog X.Y.Z (DD-MM-YYYY) ------------------ +* Move ``chunks`` kwarg functionality in MSv2PartitionEntryPoint.open_datatree + to ``partition_chunks`` (:pr:`35`) * Set MSv4 version to 4.0.0 (:pr:`34`) * Fix changelog highlighting in install instructions (:pr:`33`) * Add basic read tests (:pr:`32`)