diff --git a/graphistry/tests/validate/test_validate_graph.py b/graphistry/tests/validate/test_validate_graph.py
new file mode 100644
index 0000000000..9c06e7ca5e
--- /dev/null
+++ b/graphistry/tests/validate/test_validate_graph.py
@@ -0,0 +1,52 @@
+from graphistry.validate.validate_graph import validate_graph
+import graphistry
+import pandas as pd
+
+
+def test_validate_graph_good():
+    g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes(
+        pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id')
+    assert (validate_graph(g) is True)
+
+
+def test_validate_graph_undefined_nodeid():
+    g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes(
+        pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']}))
+    assert (validate_graph(g) is False)
+
+
+def test_validate_graph_duplicate_nodeid():
+    g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes(
+        pd.DataFrame({'id': ['a', 'a', 'b', 'c'], 'name': ['A', 'A2', 'B', 'C']}), node='id')
+    assert (validate_graph(g) is False)
+
+
+def test_validate_graph_missing_nodes():
+    g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}))
+    assert (validate_graph(g) is False)
+
+
+def test_validate_graph_nan_nodes():
+    g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes(
+        pd.DataFrame({'id': [None, 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id')
+    assert (validate_graph(g) is False)
+
+
+def test_validate_graph_missing_src_node():
+    # Only returns warning
+    g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes(
+        pd.DataFrame({'id': ['b', 'c'], 'name': ['B', 'C']}), node='id')
+    assert (validate_graph(g) is True)
+
+
+def test_validate_graph_missing_dst_node():
+    # Only returns warning
+    g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes(
+        pd.DataFrame({'id': ['a', 'b'], 'name': ['A', 'B']}), node='id')
+    assert (validate_graph(g) is True)
+
+
+def test_validate_graph_nodes_only():
+    # Only returns warning: the edge checks are skipped when no edges are bound
+    g = graphistry.nodes(pd.DataFrame({'id': ['a', 'b'], 'name': ['A', 'B']}), 'id')
+    assert (validate_graph(g) is True)
diff --git a/graphistry/validate/validate_graph.py b/graphistry/validate/validate_graph.py
new file mode 100644
index 0000000000..60d333c07a
--- /dev/null
+++ b/graphistry/validate/validate_graph.py
@@ -0,0 +1,143 @@
+"""Structural sanity checks for a graph's node and edge bindings.
+
+Each ``check_*`` function inspects one property of a graph ``g`` and returns a
+bool. ``validate_graph`` runs them in order and stops at the first failure.
+"""
+
+
+def check_node_dataframe_exists(g, verbose=True):
+    """Return whether a node DataFrame is bound to ``g``.
+
+    :param g: Graph object exposing ``_nodes``.
+    :param verbose: Print an explanation when the check fails.
+    :return: ``False`` when ``g._nodes`` is ``None``, otherwise ``True``.
+    """
+    if g._nodes is None:
+        if verbose:
+            print("Warning: graph was created with only edges. Skipping Node ID check if Node IDs match edge IDs. Use g2 = g.materialize_nodes() to force node df creation. Exiting.")
+        return False
+    return True
+
+
+def check_node_id_defined(g, verbose=True):
+    """Return whether the node ID binding is set and names a node column.
+
+    :param g: Graph object exposing ``_node`` and ``_nodes``.
+    :param verbose: Print an explanation when the check fails.
+    :return: ``False`` when ``g._node`` is ``None`` or is not a column of
+        ``g._nodes``, otherwise ``True``.
+    """
+    if g._node is None:
+        if verbose:
+            print("Invalid graph: Missing Node ID. Did you forget to specify the node ID in the .nodes() function? Exiting.")
+        return False
+    # Catch a binding that names an absent column here; otherwise it only
+    # surfaces as a KeyError in the checks that index the node DataFrame.
+    if g._nodes is not None and g._node not in g._nodes.columns:
+        if verbose:
+            print("Invalid graph: Node ID column {!r} does not exist in the node DataFrame.".format(g._node))
+        return False
+    return True
+
+
+def check_nan_node_ids(g, verbose=True):
+    """Return whether the node ID column is free of null values.
+
+    Expects ``g._nodes`` and ``g._node`` to be usable, i.e.
+    ``check_node_dataframe_exists`` and ``check_node_id_defined`` passed.
+
+    :param g: Graph object exposing ``_nodes`` and ``_node``.
+    :param verbose: Print an explanation when the check fails.
+    :return: ``False`` when any node ID is null, otherwise ``True``.
+    """
+    if g._nodes[g._node].isnull().any():
+        if verbose:
+            print("Invalid graph: Contains NaN Node IDs.")
+        return False
+    return True
+
+
+def check_duplicate_node_ids(g, verbose=True):
+    """Return whether every node ID occurs only once.
+
+    Expects ``g._nodes`` and ``g._node`` to be usable, i.e.
+    ``check_node_dataframe_exists`` and ``check_node_id_defined`` passed.
+
+    :param g: Graph object exposing ``_nodes`` and ``_node``.
+    :param verbose: Print an explanation when the check fails.
+    :return: ``False`` when a node ID is repeated, otherwise ``True``.
+    """
+    if g._nodes[g._node].duplicated().any():
+        if verbose:
+            print("Invalid graph: Contains duplicate Node IDs.")
+        return False
+    return True
+
+
+def _warn_on_unknown_edge_ids(g, column, kind, verbose):
+    """Warn when values in an edge column are missing from the node IDs.
+
+    :param g: Graph object exposing ``_edges``, ``_nodes`` and ``_node``.
+    :param column: Name of the edge column to check, or ``None`` if unbound.
+    :param kind: ``"source"`` or ``"destination"``; only selects the wording.
+    :param verbose: Print the warning when one applies.
+    :return: Always ``True``; these findings are warnings, not failures.
+    """
+    # Without edges or the binding there is nothing to compare; indexing
+    # anyway would raise TypeError/KeyError instead of reporting a warning.
+    if g._edges is None or column is None or column not in g._edges.columns:
+        if verbose:
+            print("Warning: Cannot check {} edge IDs because the edges or their {} binding are missing.".format(kind, kind))
+        return True
+    if not g._edges[column].isin(g._nodes[g._node]).all():
+        if verbose:
+            print("Warning: Contains {} edge IDs that do not exist in the node DataFrame. This can cause unexpected results.".format(kind))
+    return True
+
+
+def check_edge_sources_exist_in_nodes(g, verbose=True):
+    """Warn when edge source IDs are missing from the node DataFrame.
+
+    :param g: Graph object exposing ``_edges``, ``_source``, ``_nodes`` and ``_node``.
+    :param verbose: Print the warning when one applies.
+    :return: Always ``True``; a missing source is a warning only.
+    """
+    return _warn_on_unknown_edge_ids(g, g._source, "source", verbose)
+
+
+def check_edge_destinations_exist_in_nodes(g, verbose=True):
+    """Warn when edge destination IDs are missing from the node DataFrame.
+
+    :param g: Graph object exposing ``_edges``, ``_destination``, ``_nodes`` and ``_node``.
+    :param verbose: Print the warning when one applies.
+    :return: Always ``True``; a missing destination is a warning only.
+    """
+    return _warn_on_unknown_edge_ids(g, g._destination, "destination", verbose)
+
+
+def validate_graph(g, verbose=True):
+    """Run all graph checks and report whether ``g`` is valid.
+
+    Node checks are hard failures and stop validation at the first one that
+    fails. Edge checks only print warnings and never change the result.
+
+    :param g: Graph object to validate.
+    :param verbose: Print failure, warning and success messages.
+    :return: ``True`` when all node checks pass, otherwise ``False``.
+    """
+    node_checks = (
+        check_node_dataframe_exists,
+        check_node_id_defined,
+        check_nan_node_ids,
+        check_duplicate_node_ids,
+    )
+    for check in node_checks:
+        if not check(g, verbose):
+            return False
+
+    check_edge_sources_exist_in_nodes(g, verbose)  # Warnings only
+    check_edge_destinations_exist_in_nodes(g, verbose)  # Warnings only
+
+    if verbose:
+        print("Graph is valid.")
+    return True