Read Matrix Market with fast_matrix_market (python-graphblas#391)
* Read Matrix Market with `fast_matrix_market`

* Update usage of `from_dense` and `to_dense` in docs and notebooks

* Ignore warning from new version of pydata/sparse (we should investigate later)

* `to_pydata_sparse(v)` on Vector should create 1-d array

* Bump `awkward` to 2.1.1

* Maybe trust `ruff` to fix some things

But run `autoflake`, `isort`, `pyupgrade`, and `black` first (for now).

* Add a few notes to pre-commit hooks

* Add `fast-matrix-market` to optional dependencies documentation

* Drop autoflake in pre-commit (use ruff instead)

* Note that `scipy` is needed for all backends for mmread and mmwrite

* Add Matrix Market to `io` docs
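
An illustrative sketch of the I/O changes above (written for this page; the keyword
names `missing_value` and `fill_value` are assumptions, and the pydata `sparse`
package must be installed):

    import numpy as np
    import graphblas as gb

    # `to_pydata_sparse` on a Vector now yields a 1-d pydata/sparse array
    v = gb.Vector.from_dense(np.array([1.0, 0.0, 2.0]), missing_value=0.0)
    s = gb.io.to_pydata_sparse(v)        # 1-d sparse array with shape (3,)

    # round-trip through a dense array with an explicit fill value
    dense = v.to_dense(fill_value=0.0)   # -> numpy array [1., 0., 2.]
    w = gb.Vector.from_dense(dense, missing_value=0.0)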
eriknw authored Mar 22, 2023
1 parent c3baea8 commit 4920dc8
Showing 18 changed files with 227 additions and 116 deletions.
11 changes: 6 additions & 5 deletions .github/workflows/test_and_build.yml
@@ -143,26 +143,27 @@ jobs:
nxver=$(python -c 'import random ; print(random.choice(["=2.7", "=2.8", "=3.0", ""]))')
yamlver=$(python -c 'import random ; print(random.choice(["=5.4", "=6.0", ""]))')
sparsever=$(python -c 'import random ; print(random.choice(["=0.12", "=0.13", "=0.14", ""]))')
fmmver=$(python -c 'import random ; print(random.choice(["=1.4", ""]))')
if [[ ${{ steps.pyver.outputs.selected }} == "3.8" ]]; then
npver=$(python -c 'import random ; print(random.choice(["=1.21", "=1.22", "=1.23", ""]))')
spver=$(python -c 'import random ; print(random.choice(["=1.8", "=1.9", "=1.10", ""]))')
pdver=$(python -c 'import random ; print(random.choice(["=1.2", "=1.3", "=1.4", "=1.5", ""]))')
akver=$(python -c 'import random ; print(random.choice(["=1.9", "=1.10", "=2.0", ""]))')
akver=$(python -c 'import random ; print(random.choice(["=1.9", "=1.10", "=2.0", "=2.1", ""]))')
elif [[ ${{ steps.pyver.outputs.selected }} == "3.9" ]]; then
npver=$(python -c 'import random ; print(random.choice(["=1.21", "=1.22", "=1.23", ""]))')
spver=$(python -c 'import random ; print(random.choice(["=1.8", "=1.9", "=1.10", ""]))')
pdver=$(python -c 'import random ; print(random.choice(["=1.2", "=1.3", "=1.4", "=1.5", ""]))')
akver=$(python -c 'import random ; print(random.choice(["=1.9", "=1.10", "=2.0", ""]))')
akver=$(python -c 'import random ; print(random.choice(["=1.9", "=1.10", "=2.0", "=2.1", ""]))')
elif [[ ${{ steps.pyver.outputs.selected }} == "3.10" ]]; then
npver=$(python -c 'import random ; print(random.choice(["=1.21", "=1.22", "=1.23", ""]))')
spver=$(python -c 'import random ; print(random.choice(["=1.8", "=1.9", "=1.10", ""]))')
pdver=$(python -c 'import random ; print(random.choice(["=1.3", "=1.4", "=1.5", ""]))')
akver=$(python -c 'import random ; print(random.choice(["=1.9", "=1.10", "=2.0", ""]))')
akver=$(python -c 'import random ; print(random.choice(["=1.9", "=1.10", "=2.0", "=2.1", ""]))')
else # Python 3.11
npver=$(python -c 'import random ; print(random.choice(["=1.23", ""]))')
spver=$(python -c 'import random ; print(random.choice(["=1.9", "=1.10", ""]))')
pdver=$(python -c 'import random ; print(random.choice(["=1.5", ""]))')
akver=$(python -c 'import random ; print(random.choice(["=1.10", "=2.0.5", "=2.0.6", "=2.0.7", "=2.0.8", ""]))')
akver=$(python -c 'import random ; print(random.choice(["=1.10", "=2.0", "=2.1", ""]))')
fi
if [[ ${{ steps.sourcetype.outputs.selected }} == "source" || ${{ steps.sourcetype.outputs.selected }} == "upstream" ]]; then
# TODO: there are currently issues with some numpy versions when
@@ -188,7 +189,7 @@ jobs:
# Once we have wheels for all OSes, we can delete the last two lines.
mamba install packaging pytest coverage coveralls=3.3.1 pytest-randomly cffi donfig pyyaml${yamlver} sparse${sparsever} \
pandas${pdver} scipy${spver} numpy${npver} awkward${akver} networkx${nxver} numba${numbaver} \
pandas${pdver} scipy${spver} numpy${npver} awkward${akver} networkx${nxver} numba${numbaver} fast_matrix_market${fmmver} \
${{ matrix.slowtask == 'pytest_bizarro' && 'black' || '' }} \
${{ matrix.slowtask == 'notebooks' && 'matplotlib nbconvert jupyter "ipython>=7"' || '' }} \
${{ steps.sourcetype.outputs.selected == 'upstream' && 'cython' || '' }} \
29 changes: 20 additions & 9 deletions .pre-commit-config.yaml
@@ -24,15 +24,13 @@ repos:
hooks:
- id: validate-pyproject
name: Validate pyproject.toml
- repo: https://github.com/myint/autoflake
rev: v2.0.1
hooks:
- id: autoflake
args: [--in-place]
# We can probably remove `isort` if we come to trust `ruff --fix`,
# but we'll need to figure out the configuration to do this in `ruff`
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
# Let's keep `pyupgrade` even though `ruff --fix` probably does most of it
- repo: https://github.com/asottile/pyupgrade
rev: v3.3.1
hooks:
@@ -48,37 +46,50 @@ repos:
hooks:
- id: black
- id: black-jupyter
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.257
hooks:
- id: ruff
args: [--fix-only]
# Let's keep `flake8` even though `ruff` does much of the same.
# `flake8-bugbear` and `flake8-simplify` have caught things missed by `ruff`.
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
- id: flake8
additional_dependencies: &flake8_dependencies
# These versions need updated manually
- flake8==6.0.0
- flake8-comprehensions==3.10.1
- flake8-bugbear==23.2.13
- flake8-bugbear==23.3.12
- flake8-simplify==0.19.3
- repo: https://github.com/asottile/yesqa
rev: v1.4.0
hooks:
- id: yesqa
additional_dependencies: *flake8_dependencies
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.2.4
hooks:
- id: codespell
types_or: [python, rst, markdown]
additional_dependencies: [tomli]
files: ^(graphblas|docs)/
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.252
rev: v0.0.257
hooks:
- id: ruff
- repo: https://github.com/sphinx-contrib/sphinx-lint
rev: v0.6.7
hooks:
- id: sphinx-lint
args: [--enable, all, "--disable=line-too-long,leaked-markup"]
# `pyroma` may help keep our package standards up to date if best practices change.
# This is probably a "low value" check though and safe to remove if we want faster pre-commit.
- repo: https://github.com/regebro/pyroma
rev: "4.2"
hooks:
- id: pyroma
args: [-n, "10", .]
- repo: local
hooks:
# Add `--hook-stage manual` to pre-commit command to run (very slow)
3 changes: 2 additions & 1 deletion README.md
@@ -40,7 +40,8 @@ The following are not required by python-graphblas, but may be needed for certai
- `pandas` – required for nicer `__repr__`;
- `matplotlib` – required for basic plotting of graphs;
- `scipy` – used in io module to read/write `scipy.sparse` format;
- `networkx` – used in `io` module to interface with `networkx` graphs.
- `networkx` – used in `io` module to interface with `networkx` graphs;
- `fast-matrix-market` - for faster read/write of Matrix Market files with `gb.io.mmread` and `gb.io.mmwrite`.

## Description
Currently works with [SuiteSparse:GraphBLAS](https://github.com/DrTimothyAldenDavis/GraphBLAS), but the goal is to make it work with all implementations of the GraphBLAS spec.
2 changes: 2 additions & 0 deletions dev-requirements.txt
@@ -6,6 +6,7 @@ pyyaml
pandas
# For I/O
awkward
fast_matrix_market
networkx
scipy
sparse
@@ -16,6 +17,7 @@ matplotlib
# For linting
pre-commit
# For testing
packaging
pytest-cov
# For debugging
icecream
1 change: 1 addition & 0 deletions docs/getting_started/index.rst
@@ -34,6 +34,7 @@ to work.
- `matplotlib <https://matplotlib.org>`__ -- required for basic plotting of graphs
- `scipy <https://scipy.org/>`__ -- used in ``io`` module to read/write ``scipy.sparse`` format
- `networkx <https://networkx.org>`__ -- used in ``io`` module to interface with networkx graphs
- `fast-matrix-market <https://github.com/alugowski/fast_matrix_market>`__ -- for faster read/write of Matrix Market files with ``gb.io.mmread`` and ``gb.io.mmwrite``

GraphBLAS Fundamentals
----------------------
16 changes: 16 additions & 0 deletions docs/user_guide/io.rst
@@ -129,3 +129,19 @@ Note that A is unchanged in the above example.
The SuiteSparse export has a ``give_ownership`` option. This performs a zero-copy
move operation and invalidates the original python-graphblas object. When extreme speed is needed or memory is
too limited to make a copy, this option may be needed.
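
A minimal sketch of this pattern, assuming the ``suitesparse`` backend
(the exported dict is re-imported here with ``Matrix.ss.import_any``)::

    import graphblas as gb

    A = gb.Matrix.from_coo([0, 1], [1, 0], [10.0, 20.0], nrows=2, ncols=2)
    d = A.ss.export("csr", give_ownership=True)  # zero-copy move; A is invalidated
    B = gb.Matrix.ss.import_any(**d)             # rebuild a Matrix from the exported arrays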

Matrix Market files
-------------------

The `Matrix Market file format <https://math.nist.gov/MatrixMarket/formats.html>`_ is a common
format for storing sparse arrays in human-readable ASCII.
Matrix Market files--also called MM files--often use the ".mtx" file extension.
For example, many datasets in MM format can be found in `the SuiteSparse Matrix Collection <https://sparse.tamu.edu/>`_.

Use ``gb.io.mmread()`` to read a Matrix Market file to a python-graphblas Matrix,
and ``gb.io.mmwrite()`` to write a Matrix to a Matrix Market file.
These names match the equivalent functions in `scipy.sparse <https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.mmread.html>`_.

``scipy`` must be installed to read Matrix Market files.
If ``fast_matrix_market`` is installed, it will be used by default for
`much better performance <https://github.com/alugowski/fast_matrix_market>`_.
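
For example, a minimal round trip might look like the following sketch
(``example.mtx`` is a placeholder path)::

    import graphblas as gb

    A = gb.io.mmread("example.mtx")        # returns a gb.Matrix; fast_matrix_market is used when available
    print(A.nrows, A.ncols, A.nvals)
    gb.io.mmwrite("example_copy.mtx", A)   # write the Matrix back out in Matrix Market format
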
2 changes: 1 addition & 1 deletion environment.yml
@@ -23,7 +23,7 @@ dependencies:
- pandas
# For I/O
- awkward
# - fast_matrix_market # Coming soon...
- fast_matrix_market
- networkx
- scipy
- sparse
5 changes: 2 additions & 3 deletions graphblas/core/expr.py
@@ -160,9 +160,8 @@ def parse_indices(self, indices, shape):
raise TypeError(f"Index for {type(self.obj).__name__} cannot be a tuple")
# Convert to tuple for consistent processing
indices = (indices,)
else: # len(shape) == 2
if type(indices) is not tuple or len(indices) != 2:
raise TypeError(f"Index for {type(self.obj).__name__} must be a 2-tuple")
elif type(indices) is not tuple or len(indices) != 2:
raise TypeError(f"Index for {type(self.obj).__name__} must be a 2-tuple")

out = []
for i, idx in enumerate(indices):
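
For context, a small illustrative sketch of the indexing behavior this code enforces
(example objects and values are arbitrary):

    import graphblas as gb

    A = gb.Matrix.from_coo([0, 1], [1, 0], [1.0, 2.0], nrows=2, ncols=2)
    v = gb.Vector.from_coo([0, 1], [1.0, 2.0], size=2)

    A[0, 1].new()   # Matrix indexing requires a 2-tuple
    v[0].new()      # Vector indexing takes a single index
    # A[0]    -> TypeError: Index for Matrix must be a 2-tuple
    # v[0, 1] -> TypeError: Index for Vector cannot be a tuple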
11 changes: 4 additions & 7 deletions graphblas/core/matrix.py
@@ -3154,14 +3154,11 @@ def _prep_for_assign(self, resolved_indexes, value, mask, is_submask, replace, o
mask = _vanilla_subassign_mask(
self, mask, rowidx, colidx, replace, opts
)
elif backend == "suitesparse":
cfunc_name = "GxB_Matrix_subassign_Scalar"
else:
if backend == "suitesparse":
cfunc_name = "GxB_Matrix_subassign_Scalar"
else:
cfunc_name = "GrB_Matrix_assign_Scalar"
mask = _vanilla_subassign_mask(
self, mask, rowidx, colidx, replace, opts
)
cfunc_name = "GrB_Matrix_assign_Scalar"
mask = _vanilla_subassign_mask(self, mask, rowidx, colidx, replace, opts)
expr_repr = (
"[[{2._expr_name} rows], [{4._expr_name} cols]]"
f"({mask.name})"
31 changes: 11 additions & 20 deletions graphblas/core/ss/matrix.py
@@ -895,9 +895,8 @@ def _export(self, format=None, *, sort=False, give_ownership=False, raw=False, m
if is_iso:
if values.size > 1: # pragma: no branch (suitesparse)
values = values[:1]
else:
if values.size > nvals: # pragma: no branch (suitesparse)
values = values[:nvals]
elif values.size > nvals: # pragma: no branch (suitesparse)
values = values[:nvals]
# Note: nvals is also at `indptr[nrows]`
rv = {
"indptr": indptr,
@@ -937,9 +936,8 @@ def _export(self, format=None, *, sort=False, give_ownership=False, raw=False, m
if is_iso:
if values.size > 1: # pragma: no cover (suitesparse)
values = values[:1]
else:
if values.size > nvals:
values = values[:nvals]
elif values.size > nvals:
values = values[:nvals]
# Note: nvals is also at `indptr[ncols]`
rv = {
"indptr": indptr,
@@ -989,9 +987,8 @@ def _export(self, format=None, *, sort=False, give_ownership=False, raw=False, m
if is_iso:
if values.size > 1: # pragma: no cover (suitesparse)
values = values[:1]
else:
if values.size > nvals:
values = values[:nvals]
elif values.size > nvals:
values = values[:nvals]
# Note: nvals is also at `indptr[nvec]`
rv = {
"indptr": indptr,
@@ -1044,9 +1041,8 @@ def _export(self, format=None, *, sort=False, give_ownership=False, raw=False, m
if is_iso:
if values.size > 1: # pragma: no cover (suitesparse)
values = values[:1]
else:
if values.size > nvals:
values = values[:nvals]
elif values.size > nvals:
values = values[:nvals]
# Note: nvals is also at `indptr[nvec]`
rv = {
"indptr": indptr,
@@ -3480,15 +3476,10 @@ def _import_any(
format = "cooc"
else:
format = "coo"
elif isinstance(values, np.ndarray) and values.ndim == 2 and values.flags.f_contiguous:
format = "fullc"
else:
if (
isinstance(values, np.ndarray)
and values.ndim == 2
and values.flags.f_contiguous
):
format = "fullc"
else:
format = "fullr"
format = "fullr"
else:
format = format.lower()
if method == "pack":
15 changes: 6 additions & 9 deletions graphblas/core/ss/vector.py
@@ -551,9 +551,8 @@ def _export(self, format=None, *, sort=False, give_ownership=False, raw=False, m
if is_iso:
if values.size > 1: # pragma: no cover (suitesparse)
values = values[:1]
else:
if values.size > nvals:
values = values[:nvals]
elif values.size > nvals:
values = values[:nvals]
rv = {
"size": size,
"indices": indices,
@@ -589,9 +588,8 @@ def _export(self, format=None, *, sort=False, give_ownership=False, raw=False, m
if is_iso:
if values.size > 1: # pragma: no cover (suitesparse)
values = values[:1]
else:
if values.size > size: # pragma: no branch (suitesparse)
values = values[:size]
elif values.size > size: # pragma: no branch (suitesparse)
values = values[:size]
rv = {
"bitmap": bitmap,
"nvals": nvals[0],
@@ -616,9 +614,8 @@ def _export(self, format=None, *, sort=False, give_ownership=False, raw=False, m
if is_iso:
if values.size > 1:
values = values[:1]
else:
if values.size > size: # pragma: no branch (suitesparse)
values = values[:size]
elif values.size > size: # pragma: no branch (suitesparse)
values = values[:size]
rv = {}
if raw or is_iso:
rv["size"] = size
9 changes: 4 additions & 5 deletions graphblas/core/vector.py
@@ -1868,12 +1868,11 @@ def _prep_for_assign(self, resolved_indexes, value, mask, is_submask, replace, o
else:
cfunc_name = f"GrB_Vector_assign_{dtype_name}"
mask = _vanilla_subassign_mask(self, mask, idx, replace, opts)
elif backend == "suitesparse":
cfunc_name = "GxB_Vector_subassign_Scalar"
else:
if backend == "suitesparse":
cfunc_name = "GxB_Vector_subassign_Scalar"
else:
cfunc_name = "GrB_Vector_assign_Scalar"
mask = _vanilla_subassign_mask(self, mask, idx, replace, opts)
cfunc_name = "GrB_Vector_assign_Scalar"
mask = _vanilla_subassign_mask(self, mask, idx, replace, opts)
expr_repr = (
"[[{2._expr_name} elements]]"
f"({mask.name})" # fmt: skip