Improve construction when inferring sub-array dtype (a.k.a. array subdtype) (python-graphblas#381)

Improve construction when inferring sub-array dtype (a.k.a. array subdtype)
jim22k · Feb 4, 2023 · ed6f3ba
1 parent e7a9a31
commit ed6f3ba
Show file tree

Hide file tree

Showing 12 changed files with 154 additions and 47 deletions.
diff --git a/.github/workflows/test_and_build.yml b/.github/workflows/test_and_build.yml
@@ -162,7 +162,7 @@ jobs:
             npver=$(python -c 'import random ; print(random.choice(["=1.23", ""]))')
             spver=$(python -c 'import random ; print(random.choice(["=1.9", "=1.10", ""]))')
             pdver=$(python -c 'import random ; print(random.choice(["=1.5", ""]))')
-            akver=$(python -c 'import random ; print(random.choice(["=1.10", "=2.0.5", "=2.0.6", ""]))')
+            akver=$(python -c 'import random ; print(random.choice(["=1.10", "=2.0.5", "=2.0.6", "=2.0.7", ""]))')
           fi
           if [[ ${{ steps.sourcetype.outputs.selected }} == "source" || ${{ steps.sourcetype.outputs.selected }} == "upstream" ]]; then
             # TODO: there are currently issues with some numpy versions when

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -25,7 +25,7 @@ repos:
       - id: validate-pyproject
         name: Validate pyproject.toml
   - repo: https://github.com/myint/autoflake
-    rev: v2.0.0
+    rev: v2.0.1
     hooks:
       - id: autoflake
         args: [--in-place]
@@ -44,7 +44,7 @@ repos:
       - id: auto-walrus
         args: [--line-length, "100"]
   - repo: https://github.com/psf/black
-    rev: 22.12.0
+    rev: 23.1.0
     hooks:
       - id: black
       - id: black-jupyter
@@ -71,7 +71,7 @@ repos:
         additional_dependencies: [tomli]
         files: ^(graphblas|docs)/
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.237
+    rev: v0.0.241
     hooks:
       - id: ruff
   - repo: https://github.com/sphinx-contrib/sphinx-lint

diff --git a/graphblas/core/matrix.py b/graphblas/core/matrix.py
@@ -872,7 +872,7 @@ def from_coo(
         """
         rows = ints_to_numpy_buffer(rows, np.uint64, name="row indices")
         columns = ints_to_numpy_buffer(columns, np.uint64, name="column indices")
-        values, new_dtype = values_to_numpy_buffer(values, dtype)
+        values, dtype = values_to_numpy_buffer(values, dtype, subarray_after=1)
         # Compute nrows and ncols if not provided
         if nrows is None:
             if rows.size == 0:
@@ -882,11 +882,8 @@ def from_coo(
             if columns.size == 0:
                 raise ValueError("No column indices provided. Unable to infer ncols.")
             ncols = int(columns.max()) + 1
-        if dtype is None and values.ndim > 1:
-            # Look for array-subtdype
-            new_dtype = lookup_dtype(np.dtype((new_dtype.np_type, values.shape[1:])))
         # Create the new matrix
-        C = cls(new_dtype, nrows, ncols, name=name)
+        C = cls(dtype, nrows, ncols, name=name)
         if values.ndim == 0:
             if dup_op is not None:
                 raise ValueError(
@@ -1004,7 +1001,7 @@ def _from_csx(cls, fmt, indptr, indices, values, dtype, num, check_num, name):
             indices_name = "row indices"
         indptr = ints_to_numpy_buffer(indptr, np.uint64, name="index pointers")
         indices = ints_to_numpy_buffer(indices, np.uint64, name=indices_name)
-        values, new_dtype = values_to_numpy_buffer(values, dtype)
+        values, dtype = values_to_numpy_buffer(values, dtype, subarray_after=1)
         if num is None:
             if indices.size > 0:
                 num = int(indices.max()) + 1
@@ -1026,9 +1023,6 @@ def _from_csx(cls, fmt, indptr, indices, values, dtype, num, check_num, name):
                     "ncols must be None or equal to len(indptr) - 1; "
                     f"expected {check_num}, got {ncols}"
                 )
-        if dtype is None and values.ndim > 1:
-            # Look for array-subtdype
-            new_dtype = lookup_dtype(np.dtype((new_dtype.np_type, values.shape[1:])))
         if values.ndim == 0:
             if backend == "suitesparse":
                 # SuiteSparse GxB can handle iso-value
@@ -1055,21 +1049,21 @@ def _from_csx(cls, fmt, indptr, indices, values, dtype, num, check_num, name):
                 )
             values = np.broadcast_to(values, indices.size)
         new_mat = ffi_new("GrB_Matrix*")
-        rv = Matrix._from_obj(new_mat, new_dtype, nrows, ncols, name=name)
-        if new_dtype._is_udt:
+        rv = Matrix._from_obj(new_mat, dtype, nrows, ncols, name=name)
+        if dtype._is_udt:
             dtype_name = "UDT"
         else:
-            dtype_name = new_dtype.name
+            dtype_name = dtype.name
         call(
             f"GrB_Matrix_import_{dtype_name}",
             [
                 _Pointer(rv),
-                new_dtype,
+                dtype,
                 _as_scalar(nrows, _INDEX, is_cscalar=True),
                 _as_scalar(ncols, _INDEX, is_cscalar=True),
                 _CArray(indptr),
                 _CArray(indices),
-                _CArray(values, dtype=new_dtype),
+                _CArray(values, dtype=dtype),
                 _as_scalar(indptr.size, _INDEX, is_cscalar=True),
                 _as_scalar(indices.size, _INDEX, is_cscalar=True),
                 _as_scalar(values.shape[0], _INDEX, is_cscalar=True),
@@ -1436,12 +1430,14 @@ def from_dicts(
         col_indices = np.fromiter(itertools.chain.from_iterable(dicts), np.uint64)
         iter_values = itertools.chain.from_iterable(v.values() for v in dicts)
         if dtype is None:
-            values = np.array(list(iter_values))
-            dtype = lookup_dtype(values.dtype)
+            values, dtype = values_to_numpy_buffer(list(iter_values), subarray_after=1)
         else:
             # If we know the dtype, then using `np.fromiter` is much faster
             dtype = lookup_dtype(dtype)
-            values = np.fromiter(iter_values, dtype.np_type)
+            if dtype.np_type.subdtype is not None and np.__version__[:5] in {"1.21.", "1.22."}:
+                values, dtype = values_to_numpy_buffer(list(iter_values), dtype)
+            else:
+                values = np.fromiter(iter_values, dtype.np_type)
         return getattr(cls, methodname)(
             *args, indptr, col_indices, values, dtype, nrows=nrows, ncols=ncols, name=name
         )

diff --git a/graphblas/core/operator.py b/graphblas/core/operator.py
@@ -1313,7 +1313,7 @@ def _initialize(cls):
                             op._typed_ops[dtype] = typed_op
                             op.coercions[dtype] = target_type
         # Allow some functions to work on UDTs
-        for (unop, func) in [
+        for unop, func in [
             (unary.identity, _identity),
             (unary.one, _one),
         ]:
@@ -2287,7 +2287,7 @@ def _initialize(cls):
         # If the inputs are FP32, we use DIV_FP32; use DIV_FP64 for all other input dtypes
         truediv = binary.truediv = op.truediv = BinaryOp("truediv")
         rtruediv = binary.rtruediv = op.rtruediv = BinaryOp("rtruediv")
-        for (new_op, builtin_op) in [(truediv, binary.cdiv), (rtruediv, binary.rdiv)]:
+        for new_op, builtin_op in [(truediv, binary.cdiv), (rtruediv, binary.rdiv)]:
             for dtype in builtin_op.types:
                 if dtype.name in {"FP32", "FC32", "FC64"}:
                     orig_dtype = dtype
@@ -2420,7 +2420,7 @@ def _initialize(cls):
             left._semiring_commutes_to = right
             right._semiring_commutes_to = left
         # Allow some functions to work on UDTs
-        for (binop, func) in [
+        for binop, func in [
             (binary.first, _first),
             (binary.second, _second),
             (binary.pair, _pair),

diff --git a/graphblas/core/ss/matrix.py b/graphblas/core/ss/matrix.py
@@ -1305,7 +1305,9 @@ def _import_csr(
         )
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, subarray_after=1
+        )
         if col_indices is values:
             values = np.copy(values)
         Ap = ffi_new("GrB_Index**", ffi.from_buffer("GrB_Index*", indptr))
@@ -1493,7 +1495,9 @@ def _import_csc(
         )
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, subarray_after=1
+        )
         if row_indices is values:
             values = np.copy(values)
         Ap = ffi_new("GrB_Index**", ffi.from_buffer("GrB_Index*", indptr))
@@ -1696,7 +1700,9 @@ def _import_hypercsr(
         )
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, subarray_after=1
+        )
         if not is_iso and values.ndim == 0:
             is_iso = True
         if col_indices is values:
@@ -1917,7 +1923,9 @@ def _import_hypercsc(
         )
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, subarray_after=1
+        )
         if row_indices is values:
             values = np.copy(values)
         if not is_iso and values.ndim == 0:
@@ -2122,7 +2130,9 @@ def _import_bitmapr(
         )
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True, order="C")
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, order="C", subarray_after=2
+        )
         if bitmap is values:
             values = np.copy(values)
         if method == "import":
@@ -2313,7 +2323,9 @@ def _import_bitmapc(
         )
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True, order="F")
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, order="F", subarray_after=2
+        )
         if bitmap is values:
             values = np.copy(values)
         if method == "import":
@@ -2486,7 +2498,9 @@ def _import_fullr(
         copy = not take_ownership
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, order="C", ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, order="C", ownable=True, subarray_after=2
+        )
         if method == "import":
             nrows, ncols = get_shape(nrows, ncols, dtype, values=values)
         else:
@@ -2643,7 +2657,9 @@ def _import_fullc(
         copy = not take_ownership
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, order="F", ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, order="F", ownable=True, subarray_after=2
+        )
         if method == "import":
             nrows, ncols = get_shape(nrows, ncols, dtype, values=values)
         else:
@@ -2848,7 +2864,7 @@ def _import_coo(
 
         if method == "pack":
             dtype = matrix.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype)
+        values, dtype = values_to_numpy_buffer(values, dtype, subarray_after=1)
         if method == "import":
             matrix = gb.Matrix(dtype, nrows=nrows, ncols=ncols, name=name)
         if is_iso:

diff --git a/graphblas/core/ss/vector.py b/graphblas/core/ss/vector.py
@@ -970,7 +970,9 @@ def _import_sparse(
         indices = ints_to_numpy_buffer(indices, np.uint64, copy=copy, ownable=True, name="indices")
         if method == "pack":
             dtype = vector.dtype
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, subarray_after=1
+        )
         if indices is values:
             values = np.copy(values)
         vi = ffi_new("GrB_Index**", ffi.from_buffer("GrB_Index*", indices))
@@ -1150,7 +1152,9 @@ def _import_bitmap(
         if method == "pack":
             dtype = vector.dtype
             size = vector._size
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, subarray_after=1
+        )
         if bitmap is values:
             values = np.copy(values)
         vhandle = ffi_new("GrB_Vector*")
@@ -1320,7 +1324,9 @@ def _import_full(
         if method == "pack":
             dtype = vector.dtype
             size = vector._size
-        values, dtype = values_to_numpy_buffer(values, dtype, copy=copy, ownable=True)
+        values, dtype = values_to_numpy_buffer(
+            values, dtype, copy=copy, ownable=True, subarray_after=1
+        )
         vhandle = ffi_new("GrB_Vector*")
         vx = ffi_new("void**", ffi.from_buffer("void*", values))
         if size is None:

diff --git a/graphblas/core/utils.py b/graphblas/core/utils.py
@@ -72,7 +72,21 @@ def _get_subdtype(dtype):
     return dtype
 
 
-def values_to_numpy_buffer(array, dtype=None, *, copy=False, ownable=False, order="C"):
+def values_to_numpy_buffer(
+    array, dtype=None, *, copy=False, ownable=False, order="C", subarray_after=None
+):
+    """Convert an array-like object to a numpy array and infer the dtype if necessary.
+
+    Parameters
+    ----------
+    subarray_after : int, optional
+        If dtype is not provided, infer "sub-array" dtype if the array has extra dimensions.
+
+    Returns
+    -------
+    np.ndarray
+    dtype
+    """
     if dtype is not None:
         dtype = lookup_dtype(dtype)
         array = np.array(array, _get_subdtype(dtype.np_type), copy=copy, order=order)
@@ -85,6 +99,8 @@ def values_to_numpy_buffer(array, dtype=None, *, copy=False, ownable=False, orde
             # fix for win64 numpy handling of ints
             array = array.astype(np.int64)
         dtype = lookup_dtype(array.dtype)
+        if subarray_after is not None and array.ndim > subarray_after:
+            dtype = lookup_dtype(np.dtype((dtype.np_type, array.shape[subarray_after:])))
     if ownable and (not array.flags.owndata or not array.flags.writeable):
         array = array.copy(order)
     return array, dtype

diff --git a/graphblas/core/vector.py b/graphblas/core/vector.py
@@ -725,17 +725,14 @@ def from_coo(cls, indices, values=1.0, dtype=None, *, size=None, dup_op=None, na
         Vector
         """
         indices = ints_to_numpy_buffer(indices, np.uint64, name="indices")
-        values, new_dtype = values_to_numpy_buffer(values, dtype)
+        values, dtype = values_to_numpy_buffer(values, dtype, subarray_after=1)
         # Compute size if not provided
         if size is None:
             if indices.size == 0:
                 raise ValueError("No indices provided. Unable to infer size.")
             size = int(indices.max()) + 1
-        if dtype is None and values.ndim > 1:
-            # Look for array-subtdype
-            new_dtype = lookup_dtype(np.dtype((new_dtype.np_type, values.shape[1:])))
         # Create the new vector
-        w = cls(new_dtype, size, name=name)
+        w = cls(dtype, size, name=name)
         if values.ndim == 0:
             if dup_op is not None:
                 raise ValueError(
@@ -1791,12 +1788,14 @@ def from_dict(cls, d, dtype=None, *, size=None, name=None):
         """
         indices = np.fromiter(d.keys(), np.uint64)
         if dtype is None:
-            values = np.array(list(d.values()))  # let numpy infer dtype
-            dtype = lookup_dtype(values.dtype)
+            values, dtype = values_to_numpy_buffer(list(d.values()), subarray_after=1)
         else:
             # If we know the dtype, then using `np.fromiter` is much faster
             dtype = lookup_dtype(dtype)
-            values = np.fromiter(d.values(), dtype.np_type)
+            if dtype.np_type.subdtype is not None and np.__version__[:5] in {"1.21.", "1.22."}:
+                values, dtype = values_to_numpy_buffer(list(d.values()), dtype)
+            else:
+                values = np.fromiter(d.values(), dtype.np_type)
         if size is None and indices.size == 0:
             size = 0
         return cls.from_coo(indices, values, dtype, size=size, name=name)

diff --git a/graphblas/tests/test_matrix.py b/graphblas/tests/test_matrix.py
@@ -3612,6 +3612,7 @@ def test_ss_iteration(A):
     assert next(A.ss.iteritems()) is not None
 
 
+@pytest.mark.slow
 def test_udt():
     record_dtype = np.dtype([("x", np.bool_), ("y", np.float64)], align=True)
     udt = dtypes.register_anonymous(record_dtype, "MatrixUDT")
@@ -4231,3 +4232,37 @@ def test_ss_descriptors(A):
     else:
         with pytest.raises(ValueError, match="escriptor"):
             (A @ A).new(nthreads=4, axb_method="dot", sort=True)
+
+
+def test_subarray_dtypes():
+    """Sub-array dtypes such as ``INT64[4]`` are inferred from extra value dimensions."""
+    a = np.arange(3 * 4, dtype=np.int64).reshape(3, 4)
+    # No dtype is passed: each of the three entries takes one length-4 row of `a`
+    A = Matrix.from_coo([1, 3, 5], [0, 1, 3], a)
+    B = Matrix("INT64[4]", nrows=6, ncols=4)
+    B[1, 0] = [0, 1, 2, 3]
+    B[3, 1] = [4, 5, 6, 7]
+    B[5, 3] = [8, 9, 10, 11]
+    assert A.isequal(B, check_dtype=True)
+    for method in ["coo", "csr", "csc", "dcsr", "dcsc", "edgelist"]:
+        B = getattr(A, f"from_{method}")(*getattr(A, f"to_{method}")())
+        # Check every round trip here; `B` is rebound right after the loop, so
+        # without this assertion none of these formats would be verified.
+        assert A.isequal(B, check_dtype=True)
+    B = Matrix.from_dicts(A.to_dicts())
+    assert A.isequal(B, check_dtype=True)
+    # Same round trip, but with the sub-array dtype given explicitly
+    B = Matrix.from_dicts(A.to_dicts(), A.dtype)
+    assert A.isequal(B, check_dtype=True)

+    # b1 and b2 hold the same data: b1 is one row per entry, b2 is laid out as (nrows, ncols, 4)
+    b1 = np.arange(2 * 3 * 4, dtype=np.int64).reshape(2 * 3, 4)
+    b2 = np.arange(2 * 3 * 4, dtype=np.int64).reshape(2, 3, 4)
+    Full1 = Matrix.from_coo([0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2], b1)
+    Full2 = Matrix("INT64[4]", nrows=2, ncols=3)
+    Full2[0, 0] = [0, 1, 2, 3]
+    Full2[0, 1] = [4, 5, 6, 7]
+    Full2[0, 2] = [8, 9, 10, 11]
+    Full2[1, 0] = [12, 13, 14, 15]
+    Full2[1, 1] = [16, 17, 18, 19]
+    Full2[1, 2] = [20, 21, 22, 23]
+    assert Full1.isequal(Full2, check_dtype=True)
+    Full2 = Matrix("INT64[4]", nrows=2, ncols=3)
+    Full2[:, :] = b2
+    assert Full1.isequal(Full2, check_dtype=True)
+    if suitesparse:
+        # Dense import of the 3-D array with no dtype given
+        Full2 = Matrix.ss.import_fullr(b2)
+        assert Full1.isequal(Full2, check_dtype=True)