From c54a27436eeb2f77ad2381583a4d8b0efe381bad Mon Sep 17 00:00:00 2001
From: Sultan Orazbayev <contact@econpoint.com>
Date: Mon, 23 Jan 2023 22:34:55 +0600
Subject: [PATCH] Add conversions to/from pydata/sparse to graphblas.io (#347)

Add conversions to and from `sparse >=0.12` (a.k.a. PyData/sparse)
---
 .github/workflows/test_and_build.yml | 13 ++--
 docs/api_reference/index.rst         |  9 +++
 docs/user_guide/io.rst               |  8 +++
 graphblas/io.py                      | 91 ++++++++++++++++++++++++++++
 graphblas/tests/test_io.py           | 56 ++++++++++++++++-
 pyproject.toml                       | 12 +++-
 6 files changed, 180 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/test_and_build.yml b/.github/workflows/test_and_build.yml
index 4751e4e39..dbef77075 100644
--- a/.github/workflows/test_and_build.yml
+++ b/.github/workflows/test_and_build.yml
@@ -142,6 +142,7 @@ jobs:
           # Consider removing old versions when they become problematic or very old (>=2 years).
           nxver=$(python -c 'import random ; print(random.choice(["=2.7", "=2.8", "=3.0", ""]))')
           yamlver=$(python -c 'import random ; print(random.choice(["=5.4", "=6.0", ""]))')
+          sparsever=$(python -c 'import random ; print(random.choice(["=0.12", "=0.13", ""]))')
           if [[ ${{ steps.pyver.outputs.selected }} == "3.8" ]]; then
             npver=$(python -c 'import random ; print(random.choice(["=1.21", "=1.22", "=1.23", ""]))')
             spver=$(python -c 'import random ; print(random.choice(["=1.8", "=1.9", "=1.10", ""]))')
@@ -183,10 +184,10 @@ jobs:
           else
             numbaver=$(python -c 'import random ; print(random.choice(["=0.56", ""]))')
           fi
-          echo "versions: np${npver} sp${spver} pd${pdver} ak${akver} nx${nxver} numba${numbaver} yaml${yamlver} psgver${psgver}"
+          echo "versions: np${npver} sp${spver} pd${pdver} ak${akver} nx${nxver} numba${numbaver} yaml${yamlver} sparse${sparsever} psgver${psgver}"
 
           # Once we have wheels for all OSes, we can delete the last two lines.
-          mamba install pytest coverage coveralls=3.3.1 pytest-randomly cffi donfig pyyaml${yamlver} \
+          mamba install pytest coverage coveralls=3.3.1 pytest-randomly cffi donfig pyyaml${yamlver} sparse${sparsever} \
             pandas${pdver} scipy${spver} numpy${npver} awkward${akver} networkx${nxver} numba${numbaver} \
             ${{ matrix.slowtask == 'pytest_bizarro' && 'black' || '' }} \
             ${{ matrix.slowtask == 'notebooks' && 'matplotlib nbconvert jupyter "ipython>=7"' || '' }} \
@@ -233,7 +234,7 @@ jobs:
             if [[ $H && $normal  ]] ; then if [[ $macos  ]] ; then echo " $vanilla"     ; elif [[ $windows ]] ; then echo " $suitesparse" ; fi ; fi)$( \
             if [[ $H && $bizarro ]] ; then if [[ $macos  ]] ; then echo " $suitesparse" ; elif [[ $windows ]] ; then echo " $vanilla"     ; fi ; fi)
           echo $args
-          coverage run -m pytest --color=yes -Werror --randomly -v $args \
+          coverage run -m pytest --color=yes --randomly -v $args \
             ${{ matrix.slowtask == 'pytest_normal' && '--runslow' || '' }}
       - name: Unit tests (bizarro scalars)
         run: |
@@ -268,7 +269,7 @@ jobs:
             if [[ $H && $normal  ]] ; then if [[ $macos  ]] ; then echo " $suitesparse" ; elif [[ $windows ]] ; then echo " $vanilla"     ; fi ; fi)$( \
             if [[ $H && $bizarro ]] ; then if [[ $macos  ]] ; then echo " $vanilla"     ; elif [[ $windows ]] ; then echo " $suitesparse" ; fi ; fi)
           echo $args
-          coverage run -a -m pytest --color=yes -Werror --randomly -v $args \
+          coverage run -a -m pytest --color=yes --randomly -v $args \
             ${{ matrix.slowtask == 'pytest_bizarro' && '--runslow' || '' }}
           git checkout .  # Undo changes to scalar default
       - name: Miscellaneous tests
@@ -289,8 +290,8 @@ jobs:
           rm script.py
           # Tests whose coverage depend on order of tests :/
           # TODO: understand why these are order-dependent and try to fix
-          coverage run -a -m pytest --color=yes -Werror -x --no-mapnumpy -k test_binaryop_attributes_numpy graphblas/tests/test_op.py
-          # coverage run -a -m pytest --color=yes -Werror -x --no-mapnumpy -k test_npmonoid graphblas/tests/test_numpyops.py --runslow
+          coverage run -a -m pytest --color=yes -x --no-mapnumpy -k test_binaryop_attributes_numpy graphblas/tests/test_op.py
+          # coverage run -a -m pytest --color=yes -x --no-mapnumpy -k test_npmonoid graphblas/tests/test_numpyops.py --runslow
       - name: Auto-generated code check
         if: matrix.slowtask == 'pytest_bizarro'
         run: |
diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst
index 53ed902d9..2f829e29a 100644
--- a/docs/api_reference/index.rst
+++ b/docs/api_reference/index.rst
@@ -99,6 +99,15 @@ These methods require `scipy <https://scipy.org/>`_ to be installed.
 
 .. autofunction:: graphblas.io.to_scipy_sparse
 
+PyData Sparse
+~~~~~~~~~~~~~
+
+These methods require `sparse <https://sparse.pydata.org/>`_ to be installed.
+
+.. autofunction:: graphblas.io.from_pydata_sparse
+
+.. autofunction:: graphblas.io.to_pydata_sparse
+
 Matrix Market
 ~~~~~~~~~~~~~
 
diff --git a/docs/user_guide/io.rst b/docs/user_guide/io.rst
index d31932053..52211c465 100644
--- a/docs/user_guide/io.rst
+++ b/docs/user_guide/io.rst
@@ -68,6 +68,14 @@ that drop zero-weighted edges. The conversion from python-graphblas to scipy.spa
 zero-weighted edges, but the user should be aware of the potential for errors occurring when zero-weighted
 edges are handled by scipy.sparse.
 
+PyData.Sparse
+-------------
+
+A python-graphblas Vector or Matrix can be created from a 1-D or 2-D PyData sparse array using
+``gb.io.from_pydata_sparse()``.
+
+``gb.io.to_pydata_sparse()`` will output a 2-D (PyData) sparse array given a python-graphblas Matrix.
+The output format is chosen with ``format=`` ("coo", "dok", or "gcxs"); it defaults to "coo".
 
 Numpy (Dense)
 -------------
diff --git a/graphblas/io.py b/graphblas/io.py
index 59e4a3a0c..5111371ea 100644
--- a/graphblas/io.py
+++ b/graphblas/io.py
@@ -220,6 +220,60 @@ def from_awkward(A, *, name=None):
     raise ValueError(f"Invalid format for Matrix: {format}")
 
 
+def from_pydata_sparse(s, *, dup_op=None, name=None):
+    """Create a Vector or a Matrix from a pydata.sparse array or matrix.
+
+    Input data in "gcxs" format will be efficient when importing with SuiteSparse:GraphBLAS.
+
+    Parameters
+    ----------
+    s : sparse
+        PyData sparse array or matrix (see https://sparse.pydata.org)
+    dup_op : BinaryOp, optional
+        Aggregation function for formats that allow duplicate entries (e.g. coo)
+    name : str, optional
+        Name of resulting Vector or Matrix
+
+    Returns
+    -------
+    :class:`~graphblas.Vector`
+    :class:`~graphblas.Matrix`
+    """
+    try:
+        import sparse
+    except ImportError:  # pragma: no cover (import)
+        raise ImportError("sparse is required to import from pydata sparse") from None
+    if not isinstance(s, sparse.SparseArray):
+        raise TypeError(
+            "from_pydata_sparse only accepts objects from the `sparse` library; "
+            "see https://sparse.pydata.org"
+        )
+    if s.ndim > 2:
+        raise _GraphblasException("s.ndim must be <= 2")
+
+    if s.ndim == 1:
+        # the .asformat('coo') makes it easier to convert dok/gcxs using a single approach
+        _s = s.asformat("coo")
+        return _Vector.from_coo(
+            *_s.coords, _s.data, dtype=_s.dtype, size=_s.shape[0], dup_op=dup_op, name=name
+        )
+    # handle two-dimensional arrays
+    if isinstance(s, sparse.GCXS):
+        return from_scipy_sparse(s.to_scipy_sparse(), dup_op=dup_op, name=name)
+    if isinstance(s, (sparse.DOK, sparse.COO)):
+        _s = s.asformat("coo")
+        return _Matrix.from_coo(
+            *_s.coords,
+            _s.data,
+            nrows=_s.shape[0],
+            ncols=_s.shape[1],
+            dtype=_s.dtype,
+            dup_op=dup_op,
+            name=name,
+        )
+    raise ValueError(f"Unknown sparse array type: {type(s).__name__}")  # pragma: no cover (safety)
+
+
 # TODO: add parameters to allow different networkx classes and attribute names
 def to_networkx(m, edge_attribute="weight"):
     """Create a networkx DiGraph from a square adjacency Matrix.
@@ -482,6 +536,43 @@ def indices(self):
     return ret
 
 
+def to_pydata_sparse(A, format="coo"):
+    """Create a pydata.sparse array from a GraphBLAS Matrix or Vector.
+
+    A Vector is exported as a 1-D array and a Matrix as a 2-D array.
+
+    Parameters
+    ----------
+    A : Matrix or Vector
+        GraphBLAS object to be converted
+    format : str
+        {'coo', 'dok', 'gcxs'}; case-insensitive, defaults to 'coo'
+
+    Returns
+    -------
+    sparse array (see https://sparse.pydata.org)
+    """
+    try:
+        from sparse import COO
+    except ImportError:  # pragma: no cover (import)
+        raise ImportError("sparse is required to export to pydata sparse") from None
+
+    format = format.lower()
+    if format not in {"coo", "dok", "gcxs"}:
+        raise ValueError(f"Invalid format: {format}")
+
+    if isinstance(A, _Vector):
+        # Build the 1-D array directly so the result keeps the Vector's shape (size,).
+        indices, values = A.to_coo()
+        s = COO(indices, values, shape=(A.size,))
+    else:
+        # gcxs is built from a CSR intermediate; coo and dok from a COO intermediate.
+        B = to_scipy_sparse(A, format="csr" if format == "gcxs" else "coo")
+        s = COO.from_scipy_sparse(B)
+    # express in the desired format
+    return s.asformat(format)
+
+
 def mmread(source, *, dup_op=None, name=None):
     """Create a GraphBLAS Matrix from the contents of a Matrix Market file.
 
diff --git a/graphblas/tests/test_io.py b/graphblas/tests/test_io.py
index d1edba027..eb743daaa 100644
--- a/graphblas/tests/test_io.py
+++ b/graphblas/tests/test_io.py
@@ -5,16 +5,23 @@
 
 import graphblas as gb
 from graphblas import Matrix, dtypes
+from graphblas.exceptions import GraphblasException
 
 try:
     import networkx as nx
 except ImportError:  # pragma: no cover (import)
     nx = None
+
 try:
     import scipy.sparse as ss
 except ImportError:  # pragma: no cover (import)
     ss = None
 
+try:
+    import sparse
+except ImportError:  # pragma: no cover (import)
+    sparse = None
+
 try:
     import awkward._v2 as ak
 except ImportError:
@@ -89,7 +96,7 @@ def test_matrix_to_from_numpy():
     with pytest.raises(ValueError, match="Invalid format"):
         gb.io.to_scipy_sparse(M, "bad format")
 
-    with pytest.raises(gb.exceptions.GraphblasException, match="ndim must be"):
+    with pytest.raises(GraphblasException, match="ndim must be"):
         gb.io.from_numpy(np.array([[[1.0, 0.0], [2.0, 3.7]]]))
 
 
@@ -386,3 +393,50 @@ def test_awkward_errors():
         gb.io.to_awkward(m, format="dcsr")
     with pytest.raises(TypeError):
         gb.io.to_awkward(gb.Scalar.from_value(5))
+
+
+@pytest.mark.skipif("not sparse")
+def test_vector_to_from_pydata_sparse():
+    coords = np.array([0, 1, 2, 3, 4], dtype="int64")
+    data = np.array([10, 20, 30, 40, 50], dtype="int64")
+    s = sparse.COO(coords, data, shape=(5,))
+    v = gb.io.from_pydata_sparse(s)
+    assert v.isequal(gb.Vector.from_coo(coords, data, dtype=dtypes.INT64), check_dtype=True)
+
+    t = gb.io.to_pydata_sparse(v)
+    assert np.array_equal(t.todense().ravel(), s.todense())  # values must round-trip
+
+
+@pytest.mark.skipif("not sparse")
+def test_matrix_to_from_pydata_sparse():
+    coords = np.array([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], dtype="int64")
+    data = np.array([10, 20, 30, 40, 50], dtype="int64")
+    s = sparse.COO(coords, data, shape=(5, 5))
+    v = gb.io.from_pydata_sparse(s)
+    assert v.isequal(gb.Matrix.from_coo(*coords, data, dtype=dtypes.INT64), check_dtype=False)
+
+    t = gb.io.to_pydata_sparse(v)
+    assert np.array_equal(t.todense(), s.todense())  # shape and values must round-trip
+
+    # test ndim
+    e = sparse.random(shape=(5, 5, 5), density=0)
+    with pytest.raises(GraphblasException):
+        gb.io.from_pydata_sparse(e)
+
+    # test GCXS array conversion
+    indptr = np.array([0, 2, 3, 6], dtype="int64")
+    indices = np.array([0, 2, 2, 0, 1, 2], dtype="int64")
+    data = np.array([1, 2, 3, 4, 5, 6], dtype="int64")
+
+    g = sparse.GCXS((data, indices, indptr), shape=(3, 3), compressed_axes=[0])
+    w = gb.io.from_pydata_sparse(g)
+    coords = g.asformat("coo").coords
+    data = g.asformat("coo").data
+    assert w.isequal(gb.Matrix.from_coo(*coords, data, dtype=dtypes.INT64), check_dtype=False)
+
+    r = gb.io.to_pydata_sparse(w, format="gcxs")
+    assert np.array_equal(r.todense(), g.todense())  # shape and values must round-trip
+    with pytest.raises(ValueError, match="format"):
+        gb.io.to_pydata_sparse(w, format="badformat")
+    with pytest.raises(TypeError, match="sparse.pydata"):
+        gb.io.from_pydata_sparse(w)
diff --git a/pyproject.toml b/pyproject.toml
index 07a99b129..3151e0b25 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,9 +69,10 @@ io = [
     "networkx >=2.8",
     "scipy >=1.8",
     "awkward >=1.9",
+    "sparse >=0.12",
 ]
 viz = [
-    "matplotlib",
+    "matplotlib >=3.5",
 ]
 test = [
     "pytest",
@@ -83,7 +84,8 @@ complete = [
     "networkx >=2.8",
     "scipy >=1.8",
     "awkward >=1.9",
-    "matplotlib",
+    "sparse >=0.12",
+    "matplotlib >=3.5",
     "pytest",
 ]
 
@@ -132,6 +134,12 @@ xfail_strict = true
 markers = [
     "slow: Skipped unless --runslow passed",
 ]
+filterwarnings = [
+    # See: https://docs.python.org/3/library/warnings.html#describing-warning-filters
+    # and: https://docs.pytest.org/en/7.2.x/how-to/capture-warnings.html#controlling-warnings
+    "error",
+    "ignore:`np.bool` is a deprecated alias:DeprecationWarning:sparse._umath",  # sparse <0.13
+]
 
 [tool.coverage.run]
 branch = true