From c54a27436eeb2f77ad2381583a4d8b0efe381bad Mon Sep 17 00:00:00 2001 From: Sultan Orazbayev Date: Mon, 23 Jan 2023 22:34:55 +0600 Subject: [PATCH] Add conversions to/from pydata/sparse to graphblas.io (#347) Add conversions to and from `sparse >=0.12` (a.k.a. PyData/sparse) --- .github/workflows/test_and_build.yml | 13 ++-- docs/api_reference/index.rst | 9 +++ docs/user_guide/io.rst | 8 +++ graphblas/io.py | 91 ++++++++++++++++++++++++++++ graphblas/tests/test_io.py | 56 ++++++++++++++++- pyproject.toml | 12 +++- 6 files changed, 180 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test_and_build.yml b/.github/workflows/test_and_build.yml index 4751e4e39..dbef77075 100644 --- a/.github/workflows/test_and_build.yml +++ b/.github/workflows/test_and_build.yml @@ -142,6 +142,7 @@ jobs: # Consider removing old versions when they become problematic or very old (>=2 years). nxver=$(python -c 'import random ; print(random.choice(["=2.7", "=2.8", "=3.0", ""]))') yamlver=$(python -c 'import random ; print(random.choice(["=5.4", "=6.0", ""]))') + sparsever=$(python -c 'import random ; print(random.choice(["=0.12", "=0.13", ""]))') if [[ ${{ steps.pyver.outputs.selected }} == "3.8" ]]; then npver=$(python -c 'import random ; print(random.choice(["=1.21", "=1.22", "=1.23", ""]))') spver=$(python -c 'import random ; print(random.choice(["=1.8", "=1.9", "=1.10", ""]))') @@ -183,10 +184,10 @@ jobs: else numbaver=$(python -c 'import random ; print(random.choice(["=0.56", ""]))') fi - echo "versions: np${npver} sp${spver} pd${pdver} ak${akver} nx${nxver} numba${numbaver} yaml${yamlver} psgver${psgver}" + echo "versions: np${npver} sp${spver} pd${pdver} ak${akver} nx${nxver} numba${numbaver} yaml${yamlver} sparse${sparsever} psgver${psgver}" # Once we have wheels for all OSes, we can delete the last two lines. 
- mamba install pytest coverage coveralls=3.3.1 pytest-randomly cffi donfig pyyaml${yamlver} \ + mamba install pytest coverage coveralls=3.3.1 pytest-randomly cffi donfig pyyaml${yamlver} sparse${sparsever} \ pandas${pdver} scipy${spver} numpy${npver} awkward${akver} networkx${nxver} numba${numbaver} \ ${{ matrix.slowtask == 'pytest_bizarro' && 'black' || '' }} \ ${{ matrix.slowtask == 'notebooks' && 'matplotlib nbconvert jupyter "ipython>=7"' || '' }} \ @@ -233,7 +234,7 @@ jobs: if [[ $H && $normal ]] ; then if [[ $macos ]] ; then echo " $vanilla" ; elif [[ $windows ]] ; then echo " $suitesparse" ; fi ; fi)$( \ if [[ $H && $bizarro ]] ; then if [[ $macos ]] ; then echo " $suitesparse" ; elif [[ $windows ]] ; then echo " $vanilla" ; fi ; fi) echo $args - coverage run -m pytest --color=yes -Werror --randomly -v $args \ + coverage run -m pytest --color=yes --randomly -v $args \ ${{ matrix.slowtask == 'pytest_normal' && '--runslow' || '' }} - name: Unit tests (bizarro scalars) run: | @@ -268,7 +269,7 @@ jobs: if [[ $H && $normal ]] ; then if [[ $macos ]] ; then echo " $suitesparse" ; elif [[ $windows ]] ; then echo " $vanilla" ; fi ; fi)$( \ if [[ $H && $bizarro ]] ; then if [[ $macos ]] ; then echo " $vanilla" ; elif [[ $windows ]] ; then echo " $suitesparse" ; fi ; fi) echo $args - coverage run -a -m pytest --color=yes -Werror --randomly -v $args \ + coverage run -a -m pytest --color=yes --randomly -v $args \ ${{ matrix.slowtask == 'pytest_bizarro' && '--runslow' || '' }} git checkout . 
# Undo changes to scalar default - name: Miscellaneous tests @@ -289,8 +290,8 @@ jobs: rm script.py # Tests whose coverage depend on order of tests :/ # TODO: understand why these are order-dependent and try to fix - coverage run -a -m pytest --color=yes -Werror -x --no-mapnumpy -k test_binaryop_attributes_numpy graphblas/tests/test_op.py - # coverage run -a -m pytest --color=yes -Werror -x --no-mapnumpy -k test_npmonoid graphblas/tests/test_numpyops.py --runslow + coverage run -a -m pytest --color=yes -x --no-mapnumpy -k test_binaryop_attributes_numpy graphblas/tests/test_op.py + # coverage run -a -m pytest --color=yes -x --no-mapnumpy -k test_npmonoid graphblas/tests/test_numpyops.py --runslow - name: Auto-generated code check if: matrix.slowtask == 'pytest_bizarro' run: | diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst index 53ed902d9..2f829e29a 100644 --- a/docs/api_reference/index.rst +++ b/docs/api_reference/index.rst @@ -99,6 +99,15 @@ These methods require `scipy <https://scipy.org/>`_ to be installed. .. autofunction:: graphblas.io.to_scipy_sparse +PyData Sparse +~~~~~~~~~~~~~ + +These methods require `sparse <https://sparse.pydata.org>`_ to be installed. + +.. autofunction:: graphblas.io.from_pydata_sparse + +.. autofunction:: graphblas.io.to_pydata_sparse + Matrix Market ~~~~~~~~~~~~~ diff --git a/docs/user_guide/io.rst b/docs/user_guide/io.rst index d31932053..52211c465 100644 --- a/docs/user_guide/io.rst +++ b/docs/user_guide/io.rst @@ -68,6 +68,14 @@ that drop zero-weighted edges. The conversion from python-graphblas to scipy.spa zero-weighted edges, but the user should be aware of the potential for errors occurring when zero-weighted edges are handled by scipy.sparse. +PyData.Sparse +------------- + +A python-graphblas Matrix can be created from a 2-D (PyData) sparse array or matrix using +``gb.io.from_pydata_sparse()``. + +``gb.io.to_pydata_sparse()`` will output a 2-D (PyData) sparse array given a python-graphblas Matrix. +The sparse format can be specified. 
It defaults to "coo". Numpy (Dense) ------------- diff --git a/graphblas/io.py b/graphblas/io.py index 59e4a3a0c..5111371ea 100644 --- a/graphblas/io.py +++ b/graphblas/io.py @@ -220,6 +220,60 @@ def from_awkward(A, *, name=None): raise ValueError(f"Invalid format for Matrix: {format}") +def from_pydata_sparse(s, *, dup_op=None, name=None): + """Create a Vector or a Matrix from a pydata.sparse array or matrix. + + Input data in "gcxs" format will be efficient when importing with SuiteSparse:GraphBLAS. + + Parameters + ---------- + s : sparse + PyData sparse array or matrix (see https://sparse.pydata.org) + dup_op : BinaryOp, optional + Aggregation function for formats that allow duplicate entries (e.g. coo) + name : str, optional + Name of resulting Matrix + + Returns + ------- + :class:`~graphblas.Vector` + :class:`~graphblas.Matrix` + """ + try: + import sparse + except ImportError: # pragma: no cover (import) + raise ImportError("sparse is required to import from pydata sparse") from None + if not isinstance(s, sparse.SparseArray): + raise TypeError( + "from_pydata_sparse only accepts objects from the `sparse` library; " + "see https://sparse.pydata.org" + ) + if s.ndim > 2: + raise _GraphblasException("m.ndim must be <= 2") + + if s.ndim == 1: + # the .asformat('coo') makes it easier to convert dok/gcxs using a single approach + _s = s.asformat("coo") + return _Vector.from_coo( + _s.coords, _s.data, dtype=_s.dtype, size=_s.shape[0], dup_op=dup_op, name=name + ) + # handle two-dimensional arrays + if isinstance(s, sparse.GCXS): + return from_scipy_sparse(s.to_scipy_sparse(), dup_op=dup_op, name=name) + if isinstance(s, (sparse.DOK, sparse.COO)): + _s = s.asformat("coo") + return _Matrix.from_coo( + *_s.coords, + _s.data, + nrows=_s.shape[0], + ncols=_s.shape[1], + dtype=_s.dtype, + dup_op=dup_op, + name=name, + ) + raise ValueError(f"Unknown sparse array type: {type(s).__name__}") # pragma: no cover (safety) + + # TODO: add parameters to allow different networkx 
classes and attribute names def to_networkx(m, edge_attribute="weight"): """Create a networkx DiGraph from a square adjacency Matrix. @@ -482,6 +536,43 @@ def indices(self): return ret +def to_pydata_sparse(A, format="coo"): + """Create a pydata.sparse array from a GraphBLAS Matrix or Vector. + + Parameters + ---------- + A : Matrix or Vector + GraphBLAS object to be converted + format : str + {'coo', 'dok', 'gcxs'} + + Returns + ------- + sparse array (see https://sparse.pydata.org) + + """ + try: + from sparse import COO + except ImportError: # pragma: no cover (import) + raise ImportError("sparse is required to export to pydata sparse") from None + + format = format.lower() + if format not in {"coo", "dok", "gcxs"}: + raise ValueError(f"Invalid format: {format}") + + if format == "gcxs": + B = to_scipy_sparse(A, format="csr") + else: + # obtain an intermediate conversion via hardcoded 'coo' intermediate object + B = to_scipy_sparse(A, format="coo") + + # convert to pydata.sparse + s = COO.from_scipy_sparse(B) + + # express in the desired format + return s.asformat(format) + + def mmread(source, *, dup_op=None, name=None): """Create a GraphBLAS Matrix from the contents of a Matrix Market file. 
diff --git a/graphblas/tests/test_io.py b/graphblas/tests/test_io.py index d1edba027..eb743daaa 100644 --- a/graphblas/tests/test_io.py +++ b/graphblas/tests/test_io.py @@ -5,16 +5,23 @@ import graphblas as gb from graphblas import Matrix, dtypes +from graphblas.exceptions import GraphblasException try: import networkx as nx except ImportError: # pragma: no cover (import) nx = None + try: import scipy.sparse as ss except ImportError: # pragma: no cover (import) ss = None +try: + import sparse +except ImportError: # pragma: no cover (import) + sparse = None + try: import awkward._v2 as ak except ImportError: @@ -89,7 +96,7 @@ def test_matrix_to_from_numpy(): with pytest.raises(ValueError, match="Invalid format"): gb.io.to_scipy_sparse(M, "bad format") - with pytest.raises(gb.exceptions.GraphblasException, match="ndim must be"): + with pytest.raises(GraphblasException, match="ndim must be"): gb.io.from_numpy(np.array([[[1.0, 0.0], [2.0, 3.7]]])) @@ -386,3 +393,50 @@ def test_awkward_errors(): gb.io.to_awkward(m, format="dcsr") with pytest.raises(TypeError): gb.io.to_awkward(gb.Scalar.from_value(5)) + + +@pytest.mark.skipif("not sparse") +def test_vector_to_from_pydata_sparse(): + coords = np.array([0, 1, 2, 3, 4], dtype="int64") + data = np.array([10, 20, 30, 40, 50], dtype="int64") + s = sparse.COO(coords, data, shape=(5,)) + v = gb.io.from_pydata_sparse(s) + assert v.isequal(gb.Vector.from_coo(coords, data, dtype=dtypes.INT64), check_dtype=True) + + t = gb.io.to_pydata_sparse(v) + assert t == s + + +@pytest.mark.skipif("not sparse") +def test_matrix_to_from_pydata_sparse(): + coords = np.array([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], dtype="int64") + data = np.array([10, 20, 30, 40, 50], dtype="int64") + s = sparse.COO(coords, data, shape=(5, 5)) + v = gb.io.from_pydata_sparse(s) + assert v.isequal(gb.Matrix.from_coo(*coords, data, dtype=dtypes.INT64), check_dtype=False) + + t = gb.io.to_pydata_sparse(v) + assert t == s + + # test ndim + e = sparse.random(shape=(5, 5, 
5), density=0) + with pytest.raises(GraphblasException): + gb.io.from_pydata_sparse(e) + + # test GCXS array conversion + indptr = np.array([0, 2, 3, 6], dtype="int64") + indices = np.array([0, 2, 2, 0, 1, 2], dtype="int64") + data = np.array([1, 2, 3, 4, 5, 6], dtype="int64") + + g = sparse.GCXS((data, indices, indptr), shape=(3, 3), compressed_axes=[0]) + w = gb.io.from_pydata_sparse(g) + coords = g.asformat("coo").coords + data = g.asformat("coo").data + assert w.isequal(gb.Matrix.from_coo(*coords, data, dtype=dtypes.INT64), check_dtype=False) + + r = gb.io.to_pydata_sparse(w, format="gcxs") + assert r == g + with pytest.raises(ValueError, match="format"): + gb.io.to_pydata_sparse(w, format="badformat") + with pytest.raises(TypeError, match="sparse.pydata"): + gb.io.from_pydata_sparse(w) diff --git a/pyproject.toml b/pyproject.toml index 07a99b129..3151e0b25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,9 +69,10 @@ io = [ "networkx >=2.8", "scipy >=1.8", "awkward >=1.9", + "sparse >=0.12", ] viz = [ - "matplotlib", + "matplotlib >=3.5", ] test = [ "pytest", @@ -83,7 +84,8 @@ complete = [ "networkx >=2.8", "scipy >=1.8", "awkward >=1.9", - "matplotlib", + "sparse >=0.12", + "matplotlib >=3.5", "pytest", ] @@ -132,6 +134,12 @@ xfail_strict = true markers = [ "slow: Skipped unless --runslow passed", ] +filterwarnings = [ + # See: https://docs.python.org/3/library/warnings.html#describing-warning-filters + # and: https://docs.pytest.org/en/7.2.x/how-to/capture-warnings.html#controlling-warnings + "error", + "ignore:`np.bool` is a deprecated alias:DeprecationWarning:sparse._umath", # sparse <0.13 +] [tool.coverage.run] branch = true