pydata · dcherian · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/doc/api.rst b/doc/api.rst
@@ -250,6 +250,7 @@ Reshaping and reorganizing
    Dataset.roll
    Dataset.pad
    Dataset.sortby
+   Dataset.shuffle_by
    Dataset.broadcast_like
 
 DataArray
@@ -590,6 +591,7 @@ Reshaping and reorganizing
    DataArray.roll
    DataArray.pad
    DataArray.sortby
+   DataArray.shuffle_by
    DataArray.broadcast_like
 
 DataTree
@@ -1096,6 +1098,7 @@ Dataset
    DatasetGroupBy.var
    DatasetGroupBy.dims
    DatasetGroupBy.groups
+   DatasetGroupBy.shuffle
 
 DataArray
 ---------
@@ -1127,6 +1130,7 @@ DataArray
    DataArrayGroupBy.var
    DataArrayGroupBy.dims
    DataArrayGroupBy.groups
+   DataArrayGroupBy.shuffle
 
 Grouper Objects
 ---------------

diff --git a/doc/user-guide/groupby.rst b/doc/user-guide/groupby.rst
@@ -321,3 +321,41 @@ Different groupers can be combined to construct sophisticated GroupBy operations
     from xarray.groupers import BinGrouper
 
     ds.groupby(x=BinGrouper(bins=[5, 15, 25]), letters=UniqueGrouper()).sum()
+
+
+Shuffling
+~~~~~~~~~
+
+Shuffling generalizes sorting a DataArray or Dataset by another DataArray (called ``label`` in the example below) and follows from the idea of grouping by ``label``.
+Shuffling reorders the DataArray or the DataArrays in a Dataset such that all members of a group occur sequentially. For example,
+
+.. ipython:: python
+
+    da = xr.DataArray(
+        dims="x",
+        data=[1, 2, 3, 4, 5, 6],
+        coords={"label": ("x", "a b c a b c".split(" "))},
+    )
+    da.shuffle_by("label")
+
+
+:py:meth:`Dataset.shuffle_by` and :py:meth:`DataArray.shuffle_by` can also take Grouper objects:
+
+.. ipython:: python
+
+    from xarray.groupers import UniqueGrouper
+
+    da.shuffle_by(label=UniqueGrouper())
+
+
+Shuffling can also be performed on :py:class:`DatasetGroupBy` and :py:class:`DataArrayGroupBy` objects.
+The :py:meth:`DatasetGroupBy.shuffle` and :py:meth:`DataArrayGroupBy.shuffle` methods return new :py:class:`DatasetGroupBy` and :py:class:`DataArrayGroupBy` objects that operate on the shuffled Dataset or DataArray respectively.
+
+
+.. ipython:: python
+
+    da.groupby(label=UniqueGrouper()).shuffle()
+
+
+For chunked array types (e.g. dask or cubed), shuffling may result in a more optimized communication pattern than indexing directly with the equivalent indexer.
+Shuffling also makes GroupBy operations on chunked arrays an embarrassingly parallel problem, and may significantly improve workloads that use :py:meth:`DatasetGroupBy.map` or :py:meth:`DataArrayGroupBy.map`.
diff --git a/xarray/core/common.py b/xarray/core/common.py
@@ -52,7 +52,7 @@
         T_Variable,
     )
     from xarray.core.variable import Variable
-    from xarray.groupers import Resampler
+    from xarray.groupers import Grouper, Resampler
 
     DTypeMaybeMapping = Union[DTypeLikeSave, Mapping[Any, DTypeLikeSave]]
 
@@ -891,6 +891,68 @@ def rolling_exp(
 
         return rolling_exp.RollingExp(self, window, window_type)
 
+    def shuffle_by(
+        self,
+        group: Hashable | DataArray | Mapping[Any, Grouper] | None = None,
+        chunks: T_Chunks = None,
+        **groupers: Grouper,
+    ) -> Self:
+        """
+        Reorder ("shuffle") this object so that members of each group are adjacent.
+
+        After shuffling, all members of a group occur sequentially and in the same
+        chunk; a single chunk may hold more than one group. This is most useful for
+        chunked arrays (e.g. dask, cubed). For chunked array types the order in which
+        groups appear is not guaranteed and depends on the input chunking.
+
+        Parameters
+        ----------
+        group : Hashable or DataArray or IndexVariable or mapping of Hashable to Grouper
+            Array whose unique values should be used to group this object. If a
+            Hashable, must be the name of a coordinate contained in this object. If a
+            mapping, must map an existing variable name to a :py:class:`Grouper` instance.
+        chunks : int, tuple of int, "auto" or mapping of hashable to int or tuple of int, optional
+            How to adjust chunks along dimensions not present in the array being grouped by.
+        **groupers : Grouper
+            Grouper objects, passed by keyword, that define how to shuffle the data.
+
+        Returns
+        -------
+        DataArray or Dataset
+            The same type as this object.
+
+        See Also
+        --------
+        DataArrayGroupBy.shuffle
+        DatasetGroupBy.shuffle
+        dask.dataframe.DataFrame.shuffle
+        dask.array.shuffle
+
+        Examples
+        --------
+        >>> import dask
+        >>> from xarray.groupers import UniqueGrouper
+        >>> da = xr.DataArray(
+        ...     dims="x",
+        ...     data=dask.array.arange(10, chunks=1),
+        ...     coords={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]},
+        ...     name="a",
+        ... )
+        >>> da
+        <xarray.DataArray 'a' (x: 10)> Size: 80B
+        dask.array<arange, shape=(10,), dtype=int64, chunksize=(1,), chunktype=numpy.ndarray>
+        Coordinates:
+          * x        (x) int64 80B 1 2 3 1 2 3 1 2 3 0
+
+        >>> da.shuffle_by(x=UniqueGrouper())
+        <xarray.DataArray 'a' (x: 10)> Size: 80B
+        dask.array<shuffle, shape=(10,), dtype=int64, chunksize=(3,), chunktype=numpy.ndarray>
+        Coordinates:
+          * x        (x) int64 80B 0 1 1 1 2 2 2 3 3 3
+        """
+        grouped = self.groupby(group=group, **groupers)
+        return grouped._shuffle_obj(chunks)
+
     def _resample(
         self,
         resample_cls: type[T_Resample],

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -117,6 +117,7 @@
         Self,
         SideOptions,
         T_ChunkDimFreq,
+        T_Chunks,
         T_ChunksFreq,
         T_Xarray,
     )
@@ -661,6 +662,12 @@ def _to_dataset_whole(
         coord_names = set(self._coords)
         return Dataset._construct_direct(variables, coord_names, indexes=indexes)
 
+    def _shuffle(self, dim, *, indices: list[list[int]], chunks: T_Chunks) -> Self:
+        """Reorder along ``dim`` via the temporary dataset's ``_shuffle``."""
+        ds = self._to_temp_dataset()
+        shuffled = ds._shuffle(dim=dim, indices=indices, chunks=chunks)
+        return self._from_temp_dataset(shuffled)
+
     def to_dataset(
         self,
         dim: Hashable = None,

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -166,6 +166,7 @@
         ResampleCompatible,
         SideOptions,
         T_ChunkDimFreq,
+        T_Chunks,
         T_DatasetPadConstantValues,
         T_Xarray,
     )
@@ -3236,6 +3237,30 @@ def sel(
         result = self.isel(indexers=query_results.dim_indexers, drop=drop)
         return result._overwrite_indexes(*query_results.as_tuple()[1:])
 
+    def _shuffle(self, dim, *, indices: list[list[int]], chunks: T_Chunks) -> Self:
+        """Reorder along ``dim`` using the concatenation of the ``indices`` lists.
+
+        Only chunked arrays need special treatment; every other variable is routed
+        through ``isel``, which makes it easy to handle indexes correctly.
+        """
+        chunked_vars: dict = {}
+        eager_names: list = []
+        for key, variable in self._variables.items():
+            if is_chunked_array(variable._data):
+                chunked_vars[key] = variable
+            else:
+                eager_names.append(key)
+
+        # Variables without chunked data: plain positional indexing.
+        result = self[eager_names]
+        if dim in result.dims:
+            result = result.isel({dim: np.concatenate(indices)})
+        # Variables with chunked data: use the variable-level shuffle instead.
+        for key, variable in chunked_vars.items():
+            moved = variable._shuffle(indices=indices, dim=dim, chunks=chunks)
+            result[key] = moved
+        return result
+
     def head(
         self,
         indexers: Mapping[Any, int] | int | None = None,