Connection to Awkward Arrays (python-graphblas#280)

* Add io.to_awkward method. Converts a Vector or Matrix to an Awkward Array. While an Awkward Array elegantly mimics pointers and values, it does not naturally include the indices of a CSR matrix. However, Awkward Arrays handle JSON-like structures of arrays, so we can easily add values, indices, and even labels (in the case of hypersparse), allowing a proper round trip from GraphBLAS to Awkward Arrays and back. Awkward does not support iso-valued objects, so iso values are expanded into a full array, which yields equivalent information. Awkward Array also does not currently support u64 for indexes, so we are using i64. This should be fine as SuiteSparse:GraphBLAS only allows indices up to 2**60, which means they fit in int64 even though GraphBLAS internally stores them as uint64.
jim22k · Oct 3, 2022 · ef0cdcc · ef0cdcc
1 parent 2ea41e6
commit ef0cdcc
Show file tree

Hide file tree

Showing 4 changed files with 244 additions and 2 deletions.
diff --git a/.github/workflows/test_and_build.yml b/.github/workflows/test_and_build.yml
@@ -62,7 +62,7 @@ jobs:
           black . --check --diff
       - name: Build extension module
         run: |
-          conda install -c conda-forge pandas numba scipy networkx cffi donfig pyyaml
+          conda install -c conda-forge pandas numba scipy networkx cffi donfig pyyaml awkward
           if [[ ${{ matrix.cfg.sourcetype }} == "wheel" ]]; then
               pip install suitesparse-graphblas
           else

diff --git a/graphblas/io.py b/graphblas/io.py
@@ -189,6 +189,62 @@ def from_scipy_sparse(A, *, dup_op=None, name=None):
     )
 
 
+def from_awkward(A, *, name=None):
+    """Create a Matrix or Vector from an Awkward Array.
+
+    The Awkward Array must have top-level parameters: format, shape
+
+    The Awkward Array must have top-level attributes based on format:
+    - vec/csr/csc: values, indices
+    - hypercsr/hypercsc: values, indices, offset_labels
+
+    Parameters
+    ----------
+    A : awkward.Array
+        Awkward Array with values and indices
+    name : str, optional
+        Name of resulting Matrix or Vector
+
+    Returns
+    -------
+    Vector or Matrix
+    """
+    params = A.layout.parameters
+    missing = {"format", "shape"} - params.keys()
+    if missing:
+        raise ValueError(f"Missing parameters: {missing}")
+    format = params["format"]
+    shape = params["shape"]
+
+    if len(shape) == 1:
+        if format != "vec":
+            raise ValueError(f"Invalid format for Vector: {format}")
+        return _Vector.from_values(
+            A.indices.layout.data, A.values.layout.data, size=shape[0], name=name
+        )
+    else:
+        if format not in {"csr", "csc", "hypercsr", "hypercsc"}:
+            raise ValueError(f"Invalid format for Matrix: {format}")
+        d = {
+            "format": format,
+            "nrows": shape[0],
+            "ncols": shape[1],
+            "values": A.values.layout.content.data,
+            "indptr": A.values.layout.offsets.data,
+        }
+        if format[-1] == "r":
+            indices = "col"
+            labels = "rows"
+        else:
+            indices = "row"
+            labels = "cols"
+        d[f"{indices}_indices"] = A.indices.layout.content.data
+        d[f"sorted_{indices}s"] = True
+        if format[:5] == "hyper":
+            d[labels] = A.offset_labels.layout.data
+        return _Matrix.ss.import_any(**d, name=name)
+
+
 # TODO: add parameters to allow different networkx classes and attribute names
 def to_networkx(m):
     """Create a networkx DiGraph from a square adjacency Matrix
@@ -319,6 +375,125 @@ def to_scipy_sparse(A, format="csr"):
     return rv.asformat(format)
 
 
+_AwkwardDoublyCompressedMatrix = None
+
+
+def to_awkward(A, format=None):
+    """Create an Awkward Array from a GraphBLAS Matrix or Vector
+
+    Parameters
+    ----------
+    A : Matrix or Vector
+        GraphBLAS object to be converted
+    format : str {'csr', 'csc', 'hypercsr', 'hypercsc', 'vec'}
+        Default format is csr for Matrix; vec for Vector
+
+    The Awkward Array will have top-level attributes based on format:
+    - vec/csr/csc: values, indices
+    - hypercsr/hypercsc: values, indices, offset_labels
+
+    Top-level parameters will also be set: format, shape
+
+    Returns
+    -------
+    awkward.Array
+
+    """
+    import awkward._v2 as ak
+    from awkward._v2.forms.listoffsetform import ListOffsetForm
+    from awkward._v2.forms.numpyform import NumpyForm
+    from awkward._v2.forms.recordform import RecordForm
+
+    out_type = _output_type(A)
+    if format is None:
+        format = "vec" if out_type is _Vector else "csr"
+    format = format.lower()
+    classname = None
+
+    if out_type is _Vector:
+        if format != "vec":
+            raise ValueError(f"Invalid format for Vector: {format}")
+        size = A.nvals
+        indices, values = A.to_values()
+        form = RecordForm(
+            contents=[
+                NumpyForm(A.dtype.numba_type.name, form_key="node1"),
+                NumpyForm("int64", form_key="node0"),
+            ],
+            fields=["values", "indices"],
+        )
+        d = {"node0-data": indices, "node1-data": values}
+
+    elif out_type is _Matrix:
+        if _backend != "suitesparse":
+            raise NotImplementedError(
+                f"Conversion of Matrix to Awkward Array not supported for backend '{_backend}'"
+            )
+        if format not in {"csr", "csc", "hypercsr", "hypercsc"}:
+            raise ValueError(f"Invalid format for Matrix: {format}")
+        if format[-1] == "r":
+            size = A.nrows
+            indices = "col_indices"
+            labels = "rows"
+        else:
+            size = A.ncols
+            indices = "row_indices"
+            labels = "cols"
+        info = A.ss.export(format, sort=True)
+        if info["is_iso"]:
+            info["values"] = _np.ascontiguousarray(_np.broadcast_to(info["values"], A.nvals))
+        form = ListOffsetForm(
+            "i64",
+            RecordForm(
+                contents=[
+                    NumpyForm("int64", form_key="node3"),
+                    NumpyForm(A.dtype.numba_type.name, form_key="node4"),
+                ],
+                fields=["indices", "values"],
+            ),
+            form_key="node1",
+        )
+        d = {
+            "node1-offsets": info["indptr"],
+            "node3-data": info[indices],
+            "node4-data": info["values"],
+        }
+        if format.startswith("hyper"):
+            global _AwkwardDoublyCompressedMatrix
+            if _AwkwardDoublyCompressedMatrix is None:
+                # Define behaviors to make all fields function at the top-level
+                @ak.behaviors.mixins.mixin_class(ak.behavior)
+                class _AwkwardDoublyCompressedMatrix:
+                    @property
+                    def values(self):
+                        return self.data.values
+
+                    @property
+                    def indices(self):
+                        return self.data.indices
+
+            size = len(info[labels])
+            form = RecordForm(
+                contents=[
+                    form,
+                    NumpyForm("int64", form_key="node5"),
+                ],
+                fields=["data", "offset_labels"],
+            )
+            d["node5-data"] = info[labels]
+            classname = "_AwkwardDoublyCompressedMatrix"
+
+    else:
+        raise TypeError(f"A must be a Matrix or Vector, found {type(A)}")
+
+    ret = ak.from_buffers(form, size, d)
+    ret = ak.with_parameter(ret, "format", format)
+    ret = ak.with_parameter(ret, "shape", list(A.shape))
+    if classname:
+        ret = ak.with_name(ret, classname)
+    return ret
+
+
 def mmread(source, *, dup_op=None, name=None):
     """Create a GraphBLAS Matrix from the contents of a Matrix Market file.
 

diff --git a/graphblas/tests/test_io.py b/graphblas/tests/test_io.py
@@ -15,6 +15,11 @@
 except ImportError:  # pragma: no cover
     ss = None
 
+try:
+    import awkward._v2 as ak
+except ImportError:  # pragma: no cover
+    ak = None
+
 
 @pytest.mark.skipif("not ss")
 def test_vector_to_from_numpy():
@@ -302,3 +307,65 @@ def test_scipy_sparse():
                 assert sa.shape == M.shape
                 sa2 = gb.io.to_scipy_sparse(M, fmt)
                 assert (sa != sa2).nnz == 0
+
+
+@pytest.mark.skipif("not ak")
+def test_awkward_roundtrip():
+    # Vector
+    v = gb.Vector.from_values([1, 3, 5], [20, 21, -5], size=22)
+    for dtype in ["int16", "float32", "bool"]:
+        v1 = v.dup(dtype=dtype)
+        kv = gb.io.to_awkward(v1)
+        assert isinstance(kv, ak.Array)
+        v2 = gb.io.from_awkward(kv)
+        assert v2.isequal(v1)
+    # Matrix
+    m = gb.Matrix.from_values([0, 0, 3, 5], [1, 4, 0, 2], [1, 0, 2, -1], nrows=7, ncols=6)
+    for dtype in ["int16", "float32", "bool"]:
+        for format in ["csr", "csc", "hypercsr", "hypercsc"]:
+            m1 = m.dup(dtype=dtype)
+            km = gb.io.to_awkward(m1, format=format)
+            assert isinstance(km, ak.Array)
+            m2 = gb.io.from_awkward(km)
+            assert m2.isequal(m1)
+
+
+@pytest.mark.skipif("not ak")
+def test_awkward_iso_roundtrip():
+    # Vector
+    v = gb.Vector.from_values([1, 3, 5], [20, 20, 20], size=22)
+    assert v.ss.is_iso
+    kv = gb.io.to_awkward(v)
+    assert isinstance(kv, ak.Array)
+    v2 = gb.io.from_awkward(kv)
+    assert v2.isequal(v)
+    # Matrix
+    m = gb.Matrix.from_values([0, 0, 3, 5], [1, 4, 0, 2], [1, 1, 1, 1], nrows=7, ncols=6)
+    assert m.ss.is_iso
+    for format in ["csr", "csc", "hypercsr", "hypercsc"]:
+        km = gb.io.to_awkward(m, format=format)
+        assert isinstance(km, ak.Array)
+        m2 = gb.io.from_awkward(km)
+        assert m2.isequal(m)
+
+
+@pytest.mark.skipif("not ak")
+def test_awkward_errors():
+    v = gb.Vector.from_values([1, 3, 5], [20, 20, 20], size=22)
+    m = gb.Matrix.from_values([0, 0, 3, 5], [1, 4, 0, 2], [1, 1, 1, 1], nrows=7, ncols=6)
+    with pytest.raises(ValueError, match="Missing parameters"):
+        gb.io.from_awkward(ak.Array([1, 2, 3]))
+    with pytest.raises(ValueError, match="Invalid format for Vector"):
+        kv = gb.io.to_awkward(v)
+        kv = ak.with_parameter(kv, "format", "csr")
+        gb.io.from_awkward(kv)
+    with pytest.raises(ValueError, match="Invalid format for Matrix"):
+        km = gb.io.to_awkward(m)
+        km = ak.with_parameter(km, "format", "dcsr")
+        gb.io.from_awkward(km)
+    with pytest.raises(ValueError, match="Invalid format for Vector"):
+        gb.io.to_awkward(v, format="csr")
+    with pytest.raises(ValueError, match="Invalid format for Matrix"):
+        gb.io.to_awkward(m, format="dcsr")
+    with pytest.raises(TypeError):
+        gb.io.to_awkward(gb.Scalar.from_value(5))
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 
 extras_require = {
     "repr": ["pandas"],
-    "io": ["networkx", "scipy >=1.7.0"],
+    "io": ["networkx", "scipy >=1.7.0", "awkward"],
     "viz": ["matplotlib"],
     "test": ["pytest", "pandas"],
 }