From 207811074923c3d3121f67d474825af4bde47b8c Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Wed, 25 Jan 2023 11:31:01 -0600 Subject: [PATCH] Add `Matrix.from_edgelist` and `matrix.to_edgelist` (#374) * Add `Matrix.from_edgelist` and `matrix.to_edgelist` * Also allow `from_edgelist` to accept `(row, column, value)` triples * Add `Vector.from_pairs` that is similar to `Matrix.from_edgelist` * Move flake8 config to `.flake8` and remove `setup.cfg` * Don't allow `values` to be in NumPy arrays in `from_edgelist` --- setup.cfg => .flake8 | 9 -- .github/workflows/test_and_build.yml | 2 +- .pre-commit-config.yaml | 5 +- graphblas/core/automethods.py | 5 + graphblas/core/infix.py | 1 + graphblas/core/matrix.py | 151 +++++++++++++++++++++++++++ graphblas/core/vector.py | 69 ++++++++++++ graphblas/tests/test_matrix.py | 37 +++++++ graphblas/tests/test_op.py | 3 + graphblas/tests/test_vector.py | 19 ++++ pyproject.toml | 5 + scripts/check_versions.sh | 3 + 12 files changed, 296 insertions(+), 13 deletions(-) rename setup.cfg => .flake8 (76%) diff --git a/setup.cfg b/.flake8 similarity index 76% rename from setup.cfg rename to .flake8 index 210e81e40..0dede3f1d 100644 --- a/setup.cfg +++ b/.flake8 @@ -1,6 +1,3 @@ -[aliases] -test=pytest - [flake8] max-line-length = 100 inline-quotes = " @@ -17,9 +14,3 @@ per-file-ignores = graphblas/tests/*.py:T201, graphblas/core/ss/matrix.py:SIM113, graphblas/**/__init__.py:F401, - -[build_sphinx] -all-files = 1 -source-dir = docs -build-dir = docs/_build -warning-is-error = 1 diff --git a/.github/workflows/test_and_build.yml b/.github/workflows/test_and_build.yml index dbef77075..7c4159379 100644 --- a/.github/workflows/test_and_build.yml +++ b/.github/workflows/test_and_build.yml @@ -290,7 +290,7 @@ jobs: rm script.py # Tests whose coverage depend on order of tests :/ # TODO: understand why these are order-dependent and try to fix - coverage run -a -m pytest --color=yes -x --no-mapnumpy -k test_binaryop_attributes_numpy graphblas/tests/test_op.py + coverage run -a -m pytest --color=yes -x --no-mapnumpy --runslow -k test_binaryop_attributes_numpy graphblas/tests/test_op.py # coverage run -a -m pytest --color=yes -x --no-mapnumpy -k test_npmonoid graphblas/tests/test_numpyops.py --runslow - name: Auto-generated code check if: matrix.slowtask == 'pytest_bizarro' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9499ceeb5..d2bd720cd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,7 +56,7 @@ repos: # These versions need updated manually - flake8==6.0.0 - flake8-comprehensions==3.10.1 - - flake8-bugbear==23.1.17 + - flake8-bugbear==23.1.20 - flake8-simplify==0.19.3 - repo: https://github.com/asottile/yesqa rev: v1.4.0 @@ -71,10 +71,9 @@ repos: additional_dependencies: [tomli] files: ^(graphblas|docs)/ - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.229 + rev: v0.0.233 hooks: - id: ruff - args: [--force-exclude] - repo: https://github.com/sphinx-contrib/sphinx-lint rev: v0.6.7 hooks: diff --git a/graphblas/core/automethods.py b/graphblas/core/automethods.py index 5a3ce0753..5b8fb5726 100644 --- a/graphblas/core/automethods.py +++ b/graphblas/core/automethods.py @@ -269,6 +269,10 @@ def to_dicts(self): return self._get_value("to_dicts") +def to_edgelist(self): + return self._get_value("to_edgelist") + + def to_values(self): return self._get_value("to_values") @@ -409,6 +413,7 @@ def _main(): "to_dcsc", "to_dcsr", "to_dicts", + "to_edgelist", } common_raises = set() scalar_raises = { diff --git a/graphblas/core/infix.py b/graphblas/core/infix.py index 464335d42..22b1c5dca 100644 --- a/graphblas/core/infix.py +++ b/graphblas/core/infix.py @@ -344,6 +344,7 @@ def dup(self, dtype=None, *, clear=False, mask=None, name=None, **opts): to_dcsc = wrapdoc(Matrix.to_dcsc)(property(automethods.to_dcsc)) to_dcsr = wrapdoc(Matrix.to_dcsr)(property(automethods.to_dcsr)) to_dicts = wrapdoc(Matrix.to_dicts)(property(automethods.to_dicts)) + to_edgelist = wrapdoc(Matrix.to_edgelist)(property(automethods.to_edgelist)) to_values = wrapdoc(Matrix.to_values)(property(automethods.to_values)) wait = wrapdoc(Matrix.wait)(property(automethods.wait)) # These raise exceptions diff --git a/graphblas/core/matrix.py b/graphblas/core/matrix.py index 900c0e036..3cd0b6c7f 100644 --- a/graphblas/core/matrix.py +++ b/graphblas/core/matrix.py @@ -494,6 +494,11 @@ def to_coo(self, dtype=None, *, rows=True, columns=True, values=True, sort=True) If internally stored rowwise, the sorting will be first by rows, then by column. If internally stored columnwise, the sorting will be first by column, then by row. + See Also + -------- + to_edgelist + from_coo + Returns ------- np.ndarray[dtype=uint64] : Rows @@ -543,6 +548,35 @@ def to_coo(self, dtype=None, *, rows=True, columns=True, values=True, sort=True) c_values if values else None, ) + def to_edgelist(self, dtype=None, *, values=True, sort=True): + """Extract the indices and values as a 2-tuple of numpy arrays. + + This calls ``to_coo`` then transforms the data into an edgelist. + + Parameters + ---------- + dtype : + Requested dtype for the output values array. + values : bool, default=True + Whether to return values; will return `None` for values if `False` + sort : bool, default=True + Whether to require sorted indices. + If internally stored rowwise, the sorting will be first by rows, then by column. + If internally stored columnwise, the sorting will be first by column, then by row. + + See Also + -------- + to_coo + from_edgelist + + Returns + ------- + np.ndarray[dtype=uint64] : Edgelist + np.ndarray : Values + """ + rows, columns, values = self.to_coo(dtype, values=values, sort=sort) + return (np.column_stack([rows, columns]), values) + def build(self, rows, columns, values, *, dup_op=None, clear=False, nrows=None, ncols=None): """Rarely used method to insert values into an existing Matrix. @@ -827,6 +861,11 @@ def from_coo( name : str, optional Name to give the Matrix. + See Also + -------- + from_edgelist + to_coo + Returns ------- Matrix @@ -864,6 +903,99 @@ def from_coo( C.build(rows, columns, values, dup_op=dup_op) return C + @classmethod + def from_edgelist( + cls, + edgelist, + values=None, + dtype=None, + *, + nrows=None, + ncols=None, + dup_op=None, + name=None, + ): + """Create a new Matrix from edgelist of (row, col) pairs or (row, col, value) triples. + + This transforms the data and calls ``Matrix.from_coo``. + + Parameters + ---------- + edgelist : list or np.ndarray or iterable + A sequence of ``(row, column)`` pairs or ``(row, column, value)`` triples. + NumPy edgelist only supports ``(row, column)``; values must be passed separately. + values : list or np.ndarray or scalar, optional + List of values. If a scalar is provided, all values will be set to this single value. + The default is 1.0 if ``edgelist`` is a sequence of ``(row, column)`` pairs. + dtype : + Data type of the Matrix. If not provided, the values will be inspected + to choose an appropriate dtype. + nrows : int, optional + Number of rows in the Matrix. If not provided, ``nrows`` is computed + from the maximum row index found in the edgelist. + ncols : int, optional + Number of columns in the Matrix. If not provided, ``ncols`` is computed + from the maximum column index found in the edgelist. + dup_op : :class:`~graphblas.core.operator.BinaryOp`, optional + Function used to combine values if duplicate indices are found. + Leaving ``dup_op=None`` will raise an error if duplicates are found. + name : str, optional + Name to give the Matrix. + + See Also + -------- + from_coo + to_edgelist + + Returns + ------- + Matrix + """ + edgelist_values = None + if isinstance(edgelist, np.ndarray): + if edgelist.ndim != 2: + raise ValueError( + f"edgelist array must have 2 dimensions (nvals x 2); got {edgelist.ndim}" + ) + if edgelist.shape[1] == 3: + raise ValueError( + "edgelist as NumPy array only supports ``(row, column)``; " + "values must be passed separately." + ) + if edgelist.shape[1] != 2: + raise ValueError( + "Last dimension of edgelist array must be length 2 " + f"(for row and column); got {edgelist.shape[1]}" + ) + rows = edgelist[:, 0] + cols = edgelist[:, 1] + else: + unzipped = list(zip(*edgelist)) + if len(unzipped) == 2: + rows, cols = unzipped + elif len(unzipped) == 3: + rows, cols, edgelist_values = unzipped + elif not unzipped: + # Empty edgelist (nrows and ncols should be given) + rows = cols = unzipped + else: + raise ValueError( + "Each item in the edgelist must have two or three elements " + f"(for row and column index, and maybe values); got {len(unzipped)}" + ) + if values is None: + if edgelist_values is None: + values = 1.0 + else: + values = edgelist_values + elif edgelist_values is not None: + raise TypeError( + "Too many sources of values: from `edgelist` triples and from `values=` argument" + ) + return cls.from_coo( + rows, cols, values, dtype, nrows=nrows, ncols=ncols, dup_op=dup_op, name=name + ) + @classmethod def _from_csx(cls, fmt, indptr, indices, values, dtype, num, check_num, name): if fmt is _CSR_FORMAT: @@ -987,6 +1119,7 @@ def from_csr( from_coo from_csc from_dcsr + to_csr Matrix.ss.import_csr io.from_scipy_sparse """ @@ -1033,6 +1166,7 @@ def from_csc( from_coo from_csr from_dcsc + to_csc Matrix.ss.import_csc io.from_scipy_sparse """ @@ -1092,6 +1226,7 @@ def from_dcsr( from_coo from_csr from_dcsc + to_dcsr Matrix.ss.import_hypercsr io.from_scipy_sparse """ @@ -1175,6 +1310,7 @@ def from_dcsc( from_coo from_csc from_dcsr + to_dcsc Matrix.ss.import_hypercsc io.from_scipy_sparse """ @@ -1265,6 +1401,10 @@ def from_dicts( name : str, optional Name to give the Matrix. + See Also + -------- + to_dicts + Returns ------- Matrix @@ -1360,6 +1500,7 @@ def to_csr(self, dtype=None): to_coo to_csc to_dcsr + from_csr Matrix.ss.export io.to_scipy_sparse """ @@ -1385,6 +1526,7 @@ def to_csc(self, dtype=None): to_coo to_csr to_dcsc + from_csc Matrix.ss.export io.to_scipy_sparse """ @@ -1413,6 +1555,7 @@ def to_dcsr(self, dtype=None): to_coo to_csr to_dcsc + from_dcsc Matrix.ss.export io.to_scipy_sparse """ @@ -1459,6 +1602,7 @@ def to_dcsc(self, dtype=None): to_coo to_csc to_dcsr + from_dcsc Matrix.ss.export io.to_scipy_sparse """ @@ -1496,6 +1640,10 @@ def to_dicts(self, order="rowwise"): "columnwise" returns dict of dicts as ``{col: {row: val}}``. The default is "rowwise". + See Also + -------- + from_dicts + Returns ------- dict @@ -3102,6 +3250,7 @@ def dup(self, dtype=None, *, clear=False, mask=None, name=None, **opts): to_dcsc = wrapdoc(Matrix.to_dcsc)(property(automethods.to_dcsc)) to_dcsr = wrapdoc(Matrix.to_dcsr)(property(automethods.to_dcsr)) to_dicts = wrapdoc(Matrix.to_dicts)(property(automethods.to_dicts)) + to_edgelist = wrapdoc(Matrix.to_edgelist)(property(automethods.to_edgelist)) to_values = wrapdoc(Matrix.to_values)(property(automethods.to_values)) wait = wrapdoc(Matrix.wait)(property(automethods.wait)) # These raise exceptions @@ -3200,6 +3349,7 @@ def dup(self, dtype=None, *, clear=False, mask=None, name=None, **opts): to_dcsc = wrapdoc(Matrix.to_dcsc)(property(automethods.to_dcsc)) to_dcsr = wrapdoc(Matrix.to_dcsr)(property(automethods.to_dcsr)) to_dicts = wrapdoc(Matrix.to_dicts)(property(automethods.to_dicts)) + to_edgelist = wrapdoc(Matrix.to_edgelist)(property(automethods.to_edgelist)) to_values = wrapdoc(Matrix.to_values)(property(automethods.to_values)) wait = wrapdoc(Matrix.wait)(property(automethods.wait)) # These raise exceptions @@ -3365,6 +3515,7 @@ def to_dicts(self, order="rowwise"): get = Matrix.get isequal = Matrix.isequal isclose = Matrix.isclose + to_edgelist = Matrix.to_edgelist wait = Matrix.wait _extract_element = Matrix._extract_element _prep_for_extract = Matrix._prep_for_extract diff --git a/graphblas/core/vector.py b/graphblas/core/vector.py index 98c940b6a..47a6a1e6a 100644 --- a/graphblas/core/vector.py +++ b/graphblas/core/vector.py @@ -450,6 +450,11 @@ def to_coo(self, dtype=None, *, indices=True, values=True, sort=True): sort : bool, default=True Whether to require sorted indices. + See Also + -------- + to_dict + from_coo + Returns ------- np.ndarray[dtype=uint64] : Indices @@ -709,6 +714,12 @@ def from_coo(cls, indices, values=1.0, dtype=None, *, size=None, dup_op=None, na name : str, optional Name to give the Vector. + See Also + -------- + from_dict + from_pairs + to_coo + Returns ------- Vector @@ -740,6 +751,53 @@ def from_coo(cls, indices, values=1.0, dtype=None, *, size=None, dup_op=None, na w.build(indices, values, dup_op=dup_op) return w + @classmethod + def from_pairs(cls, pairs, dtype=None, *, size=None, dup_op=None, name=None): + """Create a new Vector from indices and values. + + This transforms the data and calls ``Vector.from_coo``. + + Parameters + ---------- + pairs : list or iterable + A sequence of ``(index, value)`` pairs. + dtype : + Data type of the Vector. If not provided, the values will be inspected + to choose an appropriate dtype. + size : int, optional + Size of the Vector. If not provided, ``size`` is computed from + the maximum index found in ``pairs``. + dup_op : BinaryOp, optional + Function used to combine values if duplicate indices are found. + Leaving ``dup_op=None`` will raise an error if duplicates are found. + name : str, optional + Name to give the Vector. + + See Also + -------- + from_coo + from_dict + to_coo + + Returns + ------- + Vector + """ + if isinstance(pairs, np.ndarray): + raise TypeError("pairs as NumPy array is not supported; use `Vector.from_coo` instead") + unzipped = list(zip(*pairs)) + if len(unzipped) == 2: + indices, values = unzipped + elif not unzipped: + # Empty pairs (size should be given) + indices = values = unzipped + else: + raise ValueError( + "Each item in the pairs must have two elements (for index and value); " + f"got {len(unzipped)}" + ) + return cls.from_coo(indices, values, dtype, size=size, dup_op=dup_op, name=name) + @property def _carg(self): return self.gb_obj[0] @@ -1740,6 +1798,12 @@ def from_dict(cls, d, dtype=None, *, size=None, name=None): name : str, optional Name to give the Vector. + See Also + -------- + from_coo + from_pairs + to_dict + Returns ------- Vector @@ -1759,6 +1823,11 @@ def from_dict(cls, d, dtype=None, *, size=None, name=None): def to_dict(self): """Return Vector as a dict in the form ``{index: val}``. + See Also + -------- + to_coo + from_dict + Returns ------- dict diff --git a/graphblas/tests/test_matrix.py b/graphblas/tests/test_matrix.py index ec24a3b1a..68d7a6f9b 100644 --- a/graphblas/tests/test_matrix.py +++ b/graphblas/tests/test_matrix.py @@ -2898,6 +2898,7 @@ def test_expr_is_like_matrix(A): "from_dcsc", "from_dcsr", "from_dicts", + "from_edgelist", "from_values", "resize", "update", @@ -2960,6 +2961,7 @@ def test_index_expr_is_like_matrix(A): "from_dcsc", "from_dcsr", "from_dicts", + "from_edgelist", "from_values", "resize", } @@ -4078,6 +4080,41 @@ def test_from_list_of_dicts(): Matrix.from_dicts(list_of_dicts, ncols=1) +def test_to_from_edgelist(A): + edgelist, values = A.to_edgelist() + result = Matrix.from_edgelist(edgelist, values) + assert result.isequal(A, check_dtype=True) + + result = Matrix.from_edgelist([[0, 1], [2, 3]]) + expected = Matrix.from_coo([0, 2], [1, 3], [1.0, 1.0]) + assert expected.isequal(result, check_dtype=True) + result = Matrix.from_edgelist([[0, 1], [2, 3]], [1.0, 1.0]) + assert expected.isequal(result, check_dtype=True) + result = Matrix.from_edgelist([[0, 1, 1.0], [2, 3, 1.0]]) + assert expected.isequal(result, check_dtype=True) + + result = Matrix.from_edgelist([[0, 1, 10], [2, 3, 20]]) + expected = Matrix.from_coo([0, 2], [1, 3], [10, 20]) + assert expected.isequal(result, check_dtype=True) + with pytest.raises(ValueError, match="values must be passed separately"): + Matrix.from_edgelist(np.array([[0, 1, 10], [2, 3, 20]], dtype=np.int64)) + + result = Matrix.from_edgelist([], nrows=2, ncols=3, dtype=int) + expected = Matrix(int, nrows=2, ncols=3) + assert expected.isequal(result, check_dtype=True) + + with pytest.raises(ValueError, match="Unable to infer nrows"): + Matrix.from_edgelist([]) + with pytest.raises(ValueError, match="edgelist must have two"): + Matrix.from_edgelist([[0, 1, 2, 3], [4, 5, 6, 7]]) + with pytest.raises(ValueError, match="edgelist array must have 2 dimensions"): + Matrix.from_edgelist(np.arange(5)) + with pytest.raises(ValueError, match="edgelist array must be length 2"): + Matrix.from_edgelist(np.arange(24).reshape(6, 4)) + with pytest.raises(TypeError, match="Too many sources of values"): + Matrix.from_edgelist([[0, 1, 10], [2, 3, 20]], values=0) + + @pytest.mark.skipif("not suitesparse") def test_ss_sort(A): A[3, 0] = 9 diff --git a/graphblas/tests/test_op.py b/graphblas/tests/test_op.py index 16ac983cc..483a1b8e8 100644 --- a/graphblas/tests/test_op.py +++ b/graphblas/tests/test_op.py @@ -574,6 +574,7 @@ def plus_plus_one(x, y): Monoid.register_anonymous(binary.plus_plus_one, {"BOOL": -1}) +@pytest.mark.slow def test_semiring_udf(): def plus_plus_two(x, y): return x + y + 2 # pragma: no cover (numba) @@ -731,6 +732,7 @@ def test_op_namespace(): assert selectnames - opnames == selectnames, selectnames - opnames +@pytest.mark.slow def test_binaryop_attributes_numpy(): # Some coverage from this test depends on order of tests assert binary.numpy.add[int].monoid is monoid.numpy.add[int] @@ -1108,6 +1110,7 @@ def test_positional(): assert semiring.ss.any_secondj[int].is_positional +@pytest.mark.slow def test_udt(): record_dtype = np.dtype([("x", np.bool_), ("y", np.float64)], align=True) udt = dtypes.register_new("TestUDT", record_dtype) diff --git a/graphblas/tests/test_vector.py b/graphblas/tests/test_vector.py index 9e4e76fe3..dcd746369 100644 --- a/graphblas/tests/test_vector.py +++ b/graphblas/tests/test_vector.py @@ -1618,6 +1618,7 @@ def test_expr_is_like_vector(v): "clear", "from_coo", "from_dict", + "from_pairs", "from_values", "resize", "update", @@ -1664,6 +1665,7 @@ def test_index_expr_is_like_vector(v): "clear", "from_coo", "from_dict", + "from_pairs", "from_values", "resize", } @@ -2468,6 +2470,23 @@ def test_to_dict(v): assert empty.to_dict() == {} +def test_from_pairs(): + w = Vector.from_pairs([[0, 1], [2, 3]]) + expected = Vector.from_coo([0, 2], [1, 3]) + assert w.isequal(expected, check_dtype=True) + with pytest.raises(TypeError, match="use `Vector.from_coo`"): + Vector.from_pairs(np.array([[0, 1], [2, 3]], dtype=np.int64)) + + w = Vector.from_pairs([], size=4) + expected = Vector(float, size=4) + assert w.isequal(expected, check_dtype=True) + + with pytest.raises(ValueError, match="Unable to infer size"): + Vector.from_pairs([]) + with pytest.raises(ValueError, match="Each item in the pair"): + Vector.from_pairs([[1, 2, 3], [4, 5, 6]]) + + @pytest.mark.skipif("not suitesparse") def test_ss_sort(v): # For equal values, indices are guaranteed to be sorted diff --git a/pyproject.toml b/pyproject.toml index 3151e0b25..46d89aff4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -211,6 +211,7 @@ select = [ "EXE", # flake8-executable "TYP", # flake8-type-checking "TRY", # tryceratops + # "PTH", # flake8-use-pathlib "RUF", # ruff-specific rules ] external = [ @@ -231,6 +232,10 @@ ignore = [ "D401", # First line of docstring should be in imperative mood: "D417", # Missing argument description in the docstring: + # Maybe consider + "TRY004", # Prefer `TypeError` exception for invalid type (Note: good advice, but not worth the nuisance) + "TRY200", # Use `raise from` to specify exception cause (Note: sometimes okay to raise original exception) + # Intentionally ignored "D203", # 1 blank line required before class docstring (Note: conflicts with D211, which is preferred) "SIM102", # Use a single `if` statement instead of nested `if` statements (Note: often necessary) diff --git a/scripts/check_versions.sh b/scripts/check_versions.sh index fb637aa51..b14f62206 100755 --- a/scripts/check_versions.sh +++ b/scripts/check_versions.sh @@ -10,3 +10,6 @@ conda search 'networkx[channel=conda-forge]>=3.0' conda search 'awkward[channel=conda-forge]>=2.0.6' conda search 'numba[channel=conda-forge]>=0.56.4' conda search 'pyyaml[channel=conda-forge]>=6.0' +conda search 'flake8-comprehensions[channel=conda-forge]>=3.10.1' +conda search 'flake8-bugbear[channel=conda-forge]>=23.1.20' +conda search 'flake8-simplify[channel=conda-forge]>=0.19.3'