diff --git a/CHANGELOG.md b/CHANGELOG.md index ca92ff585d..ad24de1e13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Development] +### Breaking + +* `from_cugraph` now returns a graph that uses the src/dst bindings of the input `cugraph.Graph` object instead of those of the base `Plottable` + +### Feat + +* Switch to `skrub` for feature engineering +* More AI methods support GPU path +* Support cugraph 26.10+ + ## [0.35.4 - 2024-12-28] ### Fixes diff --git a/graphistry/plugins/cugraph.py b/graphistry/plugins/cugraph.py index 2dd048faa2..d922cec06b 100644 --- a/graphistry/plugins/cugraph.py +++ b/graphistry/plugins/cugraph.py @@ -1,5 +1,7 @@ -import pandas as pd from typing import Any, Dict, List, Optional, Union +import pandas as pd +import warnings + from graphistry.constants import NODE from graphistry.Engine import EngineAbstract from graphistry.Plottable import Plottable @@ -8,7 +10,6 @@ logger = setup_logger(__name__) - #import logging #logger.setLevel(logging.DEBUG) @@ -38,7 +39,7 @@ def from_cugraph(self, ) -> Plottable: """ - If bound IDs, use the same IDs in the returned graph. 
+ Take input cugraph.Graph object and load in data and bindings (source, destination, edge_weight) If non-empty nodes/edges, instead of returning G's topology, use existing topology and merge in G's attributes @@ -50,8 +51,28 @@ def from_cugraph(self, #### - src = self._source or SRC_CUGRAPH - dst = self._destination or DST_CUGRAPH + if hasattr(G, 'source_columns') and G.source_columns is not None: + s = G.source_columns + if isinstance(s, list): + s = s[0] + assert isinstance(s, str), "Found G.source_columns, and expected it to be a string or a list of one string, but was: %s" % G.source_columns + if self._source is not None and self._source != s: + warnings.warn('Switching g source column name to G source column name') + else: + s = self._source or SRC_CUGRAPH + src = s + + if hasattr(G, 'destination_columns') and G.destination_columns is not None: + d = G.destination_columns + if isinstance(d, list): + d = d[0] + assert isinstance(d, str), "Found G.destination_columns, and expected it to be a string or a list of one string, but was: %s" % G.destination_columns + if self._destination is not None and self._destination != d: + warnings.warn('Switching g destination column name to G destination column name') + else: + d = self._destination or DST_CUGRAPH + dst = d + edges_gdf = G.view_edge_list() # src, dst if g._nodes is not None and load_nodes: @@ -326,7 +347,15 @@ def compute_cugraph_core( out = out[0] if out_col is not None: raise ValueError('Graph returned, but out_col was specified') - return from_cugraph(self, out, load_nodes=False) + self2 = self + if self._source != out.source_columns: + logger.debug('Switching g source column name to G source column name to work around cugraph inconsistency') + if out.source_columns == 'src': + self2 = self.edges(self._edges.rename(columns={self._source: 'src', self._destination: 'dst'}), 'src', 'dst') + res = from_cugraph(self2, out, load_nodes=False) + if not (self2 is self): + res = res.edges(self._edges, self._source, 
self._destination) + return res raise ValueError('Unsupported algorithm: %s', alg) diff --git a/graphistry/tests/plugins/test_cugraph.py b/graphistry/tests/plugins/test_cugraph.py index 7f22354659..883ac452e0 100644 --- a/graphistry/tests/plugins/test_cugraph.py +++ b/graphistry/tests/plugins/test_cugraph.py @@ -69,8 +69,8 @@ def test_minimal_edges(self): g = graphistry.from_cugraph(G, load_nodes=False) assert g._nodes is None and g._node is None assert g._source is not None and g._destination is not None - assert g._source == SRC_CUGRAPH - assert g._destination == DST_CUGRAPH + assert g._source == 'a' + assert g._destination == 'b' assert g._edges is not None assert isinstance(g._edges, cudf.DataFrame) assert len(g._edges) == len(edges) @@ -88,14 +88,14 @@ def test_minimal_attributed_edges(self): assert g._nodes is None and g._node is None assert len(g._edges) == len(edges) assert g._source is not None and g._destination is not None - assert g._source == SRC_CUGRAPH - assert g._destination == DST_CUGRAPH + assert g._source == 'a' + assert g._destination == 'b' assert g._edges is not None assert isinstance(g._edges, cudf.DataFrame) assert len(g._edges) == len(edges) assert len(g._edges[g._source].dropna()) == len(edges) assert len(g._edges[g._destination].dropna()) == len(edges) - assert (g._edges['weights'].to_pandas() == edges_w['w']).all() + assert (g._edges['w'].to_pandas() == edges_w['w']).all() def test_merge_existing_edges_pandas(self): @@ -191,8 +191,8 @@ def test_minimal_edges(self): logger.debug('G: %s', G) g2 = graphistry.from_cugraph(G) assert g2._edges.shape == g._edges.shape - assert g2._source == SRC_CUGRAPH - assert g2._destination == DST_CUGRAPH + assert g2._source == g._source + assert g2._destination == g._destination assert g2._edge is None assert g2._nodes is None and g2._node is None #logger.debug('g2._nodes: %s', g2._nodes) @@ -249,8 +249,8 @@ def test_minimal_edges_str(self): logger.debug('G: %s', G) g2 = graphistry.from_cugraph(G) assert 
g2._edges.shape == g._edges.shape - assert g2._source == SRC_CUGRAPH - assert g2._destination == DST_CUGRAPH + assert g2._source == g._source + assert g2._destination == g._destination assert g2._edge is None assert ( g2._edges @@ -283,8 +283,8 @@ def test_nodes(self): logger.debug('ig: %s', G) g2 = graphistry.from_cugraph(G).materialize_nodes() assert g2._edges.shape == g._edges.shape - assert g2._source == SRC_CUGRAPH - assert g2._destination == DST_CUGRAPH + assert g2._source == g._source + assert g2._destination == g._destination assert g2._edge is None assert g2._node == 'id' logger.debug('g2._nodes: %s', g2._nodes) @@ -336,8 +336,8 @@ def test_drop_nodes(self): logger.debug('G: %s', G) g2 = graphistry.from_cugraph(G).materialize_nodes() assert g2._edges.shape == g._edges.shape - assert g2._source == SRC_CUGRAPH - assert g2._destination == DST_CUGRAPH + assert g2._source == g._source + assert g2._destination == g._destination assert g2._edge is None logger.debug('g2._nodes: %s', g2._nodes) logger.debug('other: %s', nodes) @@ -604,6 +604,8 @@ def test_all_calls(self): edges3_gdf = cudf.from_pandas(edges3_df) g = graphistry.edges(edges3_gdf, 'a', 'b').bind(edge_weight='f').materialize_nodes() + assert g._source == 'a' + assert g._destination == 'b' for alg in [x for x in compute_algs]: if alg not in skiplist: opts = overrides[alg] if alg in overrides else {}