From f6ac839b48dfda408bf424064ca2b6b4179e128b Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 20 Dec 2024 20:21:56 -0600 Subject: [PATCH 01/33] (feature): new support for Google Spanner Graph --- graphistry/PlotterBase.py | 2 + graphistry/database_clients/SpannerGraph.py | 287 ++++++++++++++++++++ graphistry/pygraphistry.py | 18 ++ 3 files changed, 307 insertions(+) create mode 100644 graphistry/database_clients/SpannerGraph.py diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 00bce624b1..9aed7aeab8 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -38,6 +38,7 @@ from .arrow_uploader import ArrowUploader from .nodexlistry import NodeXLGraphistry from .tigeristry import Tigeristry +from .database_clients.spannergraph import spannergraph from .util import setup_logger logger = setup_logger(__name__) @@ -176,6 +177,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # Integrations self._bolt_driver : Any = None self._tigergraph : Any = None + self._spannergraph : Any = None # feature engineering self._node_embedding = None diff --git a/graphistry/database_clients/SpannerGraph.py b/graphistry/database_clients/SpannerGraph.py new file mode 100644 index 0000000000..d4e47b09c5 --- /dev/null +++ b/graphistry/database_clients/SpannerGraph.py @@ -0,0 +1,287 @@ +import os +import pandas as pd +import json +import time +import logging +from typing import Any +from google.cloud.spanner_dbapi.connection import connect +from google.cloud.spanner_v1.data_types import JsonObject +from google.cloud.spanner_v1 import KeySet + +logging.basicConfig(level=logging.INFO) + +class QueryResult: + """ + Encapsulates the results of a query, including metadata. + + Attributes: + data (list): The raw query results. + execution_time (float): The time taken to execute the query. + record_count (int): The number of records returned. 
+ """ + + def __init__(self, data: list, execution_time: float): + self.data = data + self.execution_time = execution_time + self.record_count = len(data) + + def summary(self) -> dict: + """ + Provides a summary of the query execution. + + Returns: + dict: A summary of the query results. + """ + return { + "execution_time": self.execution_time, + "record_count": self.record_count + } + +class spannergraph: + """ + A comprehensive interface for interacting with Google Spanner Graph databases. + + This class provides methods for connecting to the database, executing queries, + processing graph data into structured formats, and constructing visualizations + with the Graphistry library. + + Attributes: + project_id (str): The Google Cloud project ID. + instance_id (str): The Spanner instance ID. + database_id (str): The Spanner database ID. + connection: The active connection to the Spanner database. + graphistry: The Graphistry parent object. + + Key Methods: + __connect: Establish a connection to the Spanner database. + execute_query: Run a GQL query against the database. + parse_spanner_json: Convert raw Spanner JSON results into structured data. + get_nodes_df: Extract and structure graph nodes into a pandas DataFrame. + get_edges_df: Extract and structure graph edges into a pandas DataFrame. + gql_to_graph: Combine the above methods to generate a Graphistry visualization. + get_schema: Retrieve the schema of the database. + validate_data: Validate input data for queries or updates. + dump_config: Returns the configuration of the spannergraph instance. + """ + + def __init__(self, graphistry, project_id: str, instance_id: str, database_id: str): + """ + Initializes the spannergraph instance. + + Args: + graphistry: The Graphistry parent object. + project_id (str): The Google Cloud project ID. + instance_id (str): The Spanner instance ID. + database_id (str): The Spanner database ID. 
+ """ + self.graphistry = graphistry + self.project_id = project_id + self.instance_id = instance_id + self.database_id = database_id + self.connection = self.__connect() + + def __connect(self) -> Any: + """ + Establish a connection to the Spanner database. + + Returns: + Any: A connection object to the Spanner database. + + Raises: + ConnectionError: If the connection to Spanner fails. + """ + try: + connection = connect(self.instance_id, self.database_id) + connection.autocommit = True + logging.info("Connected to Spanner database.") + return connection + except Exception as e: + raise ConnectionError(f"Failed to connect to Spanner: {e}") + + def close_connection(self) -> None: + """ + Closes the connection to the Spanner database. + """ + if self.connection: + self.connection.close() + logging.info("Connection to Spanner database closed.") + + def execute_query(self, query: str) -> QueryResult: + """ + Executes a GQL query on the Spanner database. + + Args: + query (str): The GQL query to execute. + + Returns: + QueryResult: The results of the query execution. + + Raises: + RuntimeError: If the query execution fails. + """ + try: + start_time = time.time() + cursor = self.connection.cursor() + cursor.execute(query) + results = cursor.fetchall() + execution_time = time.time() - start_time + logging.info(f"Query executed in {execution_time:.4f} seconds.") + return QueryResult(results, execution_time) + except Exception as e: + raise RuntimeError(f"Query execution failed: {e}") + + @staticmethod + def parse_spanner_json(query_result: QueryResult) -> list: + """ + Converts Spanner JSON graph data into structured Python objects. + + Args: + query_result (QueryResult): The results of the executed query. + + Returns: + list: A list of dictionaries containing nodes and edges. 
+ """ + data = [ query_result.data ] + json_list = [] + for record in data: + for item in record: + json_entry = {"nodes": [], "edges": []} + elements = json.loads(item.serialize()) if isinstance(item, JsonObject) else item + for element in elements: + if element.get('kind') == 'node': + for label in element.get('labels', []): + json_entry["nodes"].append({ + "label": label, + "identifier": element.get('identifier'), + "properties": element.get('properties', {}) + }) + elif element.get('kind') == 'edge': + for label in element.get('labels', []): + json_entry["edges"].append({ + "label": label, + "identifier": element.get('identifier'), + "source": element.get('source_node_identifier'), + "destination": element.get('destination_node_identifier'), + "properties": element.get('properties', {}) + }) + if json_entry["nodes"] or json_entry["edges"]: + json_list.append(json_entry) + return json_list + + @staticmethod + def get_nodes_df(json_data: list) -> pd.DataFrame: + """ + Converts graph nodes into a pandas DataFrame. + + Args: + json_data (list): The structured JSON data containing graph nodes. + + Returns: + pd.DataFrame: A DataFrame containing node information. + """ + nodes = [ + {"label": node["label"], "identifier": node["identifier"], **node["properties"]} + for entry in json_data + for node in entry["nodes"] + ] + nodes_df = pd.DataFrame(nodes).drop_duplicates() + nodes_df['type'] = nodes_df['label'] + return nodes_df + + @staticmethod + def get_edges_df(json_data: list) -> pd.DataFrame: + """ + Converts graph edges into a pandas DataFrame. + + Args: + json_data (list): The structured JSON data containing graph edges. + + Returns: + pd.DataFrame: A DataFrame containing edge information. 
+ """ + edges = [ + { + "label": edge["label"], + "identifier": edge["identifier"], + "source": edge["source"], + "destination": edge["destination"], + **edge["properties"] + } + for entry in json_data + for edge in entry["edges"] + ] + edges_df = pd.DataFrame(edges).drop_duplicates() + edges_df['type'] = edges_df['label'] + return edges_df + + def gql_to_graph(self, query: str) -> Any: + """ + Executes a query and constructs a graphistry graph from the results. + + Args: + query (str): The GQL query to execute. + + Returns: + Any: A graphistry graph object constructed from the query results. + """ + query_result = self.execute_query(query) + json_data = self.parse_spanner_json(query_result) + nodes_df = self.get_nodes_df(json_data) + edges_df = self.get_edges_df(json_data) + g = self.graphistry.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') + return g + + def get_schema(self) -> dict: + """ + Retrieves the schema of the Spanner database. + + Returns: + dict: A dictionary containing table names and column details. + """ + schema = {} + try: + cursor = self.connection.cursor() + cursor.execute("SELECT table_name, column_name, spanner_type FROM information_schema.columns") + for row in cursor.fetchall(): + table_name, column_name, spanner_type = row + if table_name not in schema: + schema[table_name] = [] + schema[table_name].append({"column_name": column_name, "type": spanner_type}) + logging.info("Database schema retrieved successfully.") + except Exception as e: + logging.error(f"Failed to retrieve schema: {e}") + return schema + + def validate_data(self, data: dict, schema: dict) -> bool: + """ + Validates input data against the database schema. + + Args: + data (dict): The data to validate. + schema (dict): The schema of the database. + + Returns: + bool: True if the data is valid, False otherwise. 
+ """ + for table, columns in data.items(): + if table not in schema: + logging.error(f"Table {table} does not exist in schema.") + return False + for record in columns: + for key in record.keys(): + if key not in [col["column_name"] for col in schema[table]]: + logging.error(f"Column {key} is not valid for table {table}.") + return False + logging.info("Data validation passed.") + return True + + def dump_config(self) -> dict: + """ + Returns the current configuration of the spannergraph instance. + + Returns: + dict: A dictionary containing configuration details. + """ + return { + "project_id": self + } diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index aac4974348..bf7521f99d 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -1824,6 +1824,23 @@ def tigergraph( protocol, server, web_port, api_port, db, user, pwd, verbose ) + + @staticmethod + def spannergraph(project_id, instance_id, database_id): + """ + Create a new PlotterBase instance with SpannerGraph configured. + + Args: + project_id (str): Google Cloud project ID. + instance_id (str): Spanner instance ID. + database_id (str): Spanner database ID. + + Returns: + PlotterBase: A PlotterBase instance configured with SpannerGraph. 
+ """ + return Plotter().spannergraph(project_id, instance_id, database_id) + + @staticmethod def gsql_endpoint( self, method_name, args={}, bindings=None, db=None, dry_run=False @@ -2485,6 +2502,7 @@ def _handle_api_response(response): cypher = PyGraphistry.cypher nodexl = PyGraphistry.nodexl tigergraph = PyGraphistry.tigergraph +spannergraph = PyGraphistry.spannergraph cosmos = PyGraphistry.cosmos neptune = PyGraphistry.neptune gremlin = PyGraphistry.gremlin From fbfff6c5168ebd8ca1db15eba69341da58419532 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Tue, 24 Dec 2024 13:42:54 -0600 Subject: [PATCH 02/33] renamed spannergraph file --- graphistry/database_clients/{SpannerGraph.py => spannergraph.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename graphistry/database_clients/{SpannerGraph.py => spannergraph.py} (100%) diff --git a/graphistry/database_clients/SpannerGraph.py b/graphistry/database_clients/spannergraph.py similarity index 100% rename from graphistry/database_clients/SpannerGraph.py rename to graphistry/database_clients/spannergraph.py From 53bd5ab3bb6d3d2a506fd62ed8a88776066aa8d3 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Thu, 9 Jan 2025 21:39:53 -0600 Subject: [PATCH 03/33] changed dir for spannergraph.py, added module in setup.py --- graphistry/plugins/spannergraph.py | 287 +++++++++++++++++++++++++++++ setup.py | 1 + 2 files changed, 288 insertions(+) create mode 100644 graphistry/plugins/spannergraph.py diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py new file mode 100644 index 0000000000..d4e47b09c5 --- /dev/null +++ b/graphistry/plugins/spannergraph.py @@ -0,0 +1,287 @@ +import os +import pandas as pd +import json +import time +import logging +from typing import Any +from google.cloud.spanner_dbapi.connection import connect +from google.cloud.spanner_v1.data_types import JsonObject +from google.cloud.spanner_v1 import KeySet + +logging.basicConfig(level=logging.INFO) + +class QueryResult: + 
""" + Encapsulates the results of a query, including metadata. + + Attributes: + data (list): The raw query results. + execution_time (float): The time taken to execute the query. + record_count (int): The number of records returned. + """ + + def __init__(self, data: list, execution_time: float): + self.data = data + self.execution_time = execution_time + self.record_count = len(data) + + def summary(self) -> dict: + """ + Provides a summary of the query execution. + + Returns: + dict: A summary of the query results. + """ + return { + "execution_time": self.execution_time, + "record_count": self.record_count + } + +class spannergraph: + """ + A comprehensive interface for interacting with Google Spanner Graph databases. + + This class provides methods for connecting to the database, executing queries, + processing graph data into structured formats, and constructing visualizations + with the Graphistry library. + + Attributes: + project_id (str): The Google Cloud project ID. + instance_id (str): The Spanner instance ID. + database_id (str): The Spanner database ID. + connection: The active connection to the Spanner database. + graphistry: The Graphistry parent object. + + Key Methods: + __connect: Establish a connection to the Spanner database. + execute_query: Run a GQL query against the database. + parse_spanner_json: Convert raw Spanner JSON results into structured data. + get_nodes_df: Extract and structure graph nodes into a pandas DataFrame. + get_edges_df: Extract and structure graph edges into a pandas DataFrame. + gql_to_graph: Combine the above methods to generate a Graphistry visualization. + get_schema: Retrieve the schema of the database. + validate_data: Validate input data for queries or updates. + dump_config: Returns the configuration of the spannergraph instance. + """ + + def __init__(self, graphistry, project_id: str, instance_id: str, database_id: str): + """ + Initializes the spannergraph instance. 
+ + Args: + graphistry: The Graphistry parent object. + project_id (str): The Google Cloud project ID. + instance_id (str): The Spanner instance ID. + database_id (str): The Spanner database ID. + """ + self.graphistry = graphistry + self.project_id = project_id + self.instance_id = instance_id + self.database_id = database_id + self.connection = self.__connect() + + def __connect(self) -> Any: + """ + Establish a connection to the Spanner database. + + Returns: + Any: A connection object to the Spanner database. + + Raises: + ConnectionError: If the connection to Spanner fails. + """ + try: + connection = connect(self.instance_id, self.database_id) + connection.autocommit = True + logging.info("Connected to Spanner database.") + return connection + except Exception as e: + raise ConnectionError(f"Failed to connect to Spanner: {e}") + + def close_connection(self) -> None: + """ + Closes the connection to the Spanner database. + """ + if self.connection: + self.connection.close() + logging.info("Connection to Spanner database closed.") + + def execute_query(self, query: str) -> QueryResult: + """ + Executes a GQL query on the Spanner database. + + Args: + query (str): The GQL query to execute. + + Returns: + QueryResult: The results of the query execution. + + Raises: + RuntimeError: If the query execution fails. + """ + try: + start_time = time.time() + cursor = self.connection.cursor() + cursor.execute(query) + results = cursor.fetchall() + execution_time = time.time() - start_time + logging.info(f"Query executed in {execution_time:.4f} seconds.") + return QueryResult(results, execution_time) + except Exception as e: + raise RuntimeError(f"Query execution failed: {e}") + + @staticmethod + def parse_spanner_json(query_result: QueryResult) -> list: + """ + Converts Spanner JSON graph data into structured Python objects. + + Args: + query_result (QueryResult): The results of the executed query. + + Returns: + list: A list of dictionaries containing nodes and edges. 
+ """ + data = [ query_result.data ] + json_list = [] + for record in data: + for item in record: + json_entry = {"nodes": [], "edges": []} + elements = json.loads(item.serialize()) if isinstance(item, JsonObject) else item + for element in elements: + if element.get('kind') == 'node': + for label in element.get('labels', []): + json_entry["nodes"].append({ + "label": label, + "identifier": element.get('identifier'), + "properties": element.get('properties', {}) + }) + elif element.get('kind') == 'edge': + for label in element.get('labels', []): + json_entry["edges"].append({ + "label": label, + "identifier": element.get('identifier'), + "source": element.get('source_node_identifier'), + "destination": element.get('destination_node_identifier'), + "properties": element.get('properties', {}) + }) + if json_entry["nodes"] or json_entry["edges"]: + json_list.append(json_entry) + return json_list + + @staticmethod + def get_nodes_df(json_data: list) -> pd.DataFrame: + """ + Converts graph nodes into a pandas DataFrame. + + Args: + json_data (list): The structured JSON data containing graph nodes. + + Returns: + pd.DataFrame: A DataFrame containing node information. + """ + nodes = [ + {"label": node["label"], "identifier": node["identifier"], **node["properties"]} + for entry in json_data + for node in entry["nodes"] + ] + nodes_df = pd.DataFrame(nodes).drop_duplicates() + nodes_df['type'] = nodes_df['label'] + return nodes_df + + @staticmethod + def get_edges_df(json_data: list) -> pd.DataFrame: + """ + Converts graph edges into a pandas DataFrame. + + Args: + json_data (list): The structured JSON data containing graph edges. + + Returns: + pd.DataFrame: A DataFrame containing edge information. 
+ """ + edges = [ + { + "label": edge["label"], + "identifier": edge["identifier"], + "source": edge["source"], + "destination": edge["destination"], + **edge["properties"] + } + for entry in json_data + for edge in entry["edges"] + ] + edges_df = pd.DataFrame(edges).drop_duplicates() + edges_df['type'] = edges_df['label'] + return edges_df + + def gql_to_graph(self, query: str) -> Any: + """ + Executes a query and constructs a graphistry graph from the results. + + Args: + query (str): The GQL query to execute. + + Returns: + Any: A graphistry graph object constructed from the query results. + """ + query_result = self.execute_query(query) + json_data = self.parse_spanner_json(query_result) + nodes_df = self.get_nodes_df(json_data) + edges_df = self.get_edges_df(json_data) + g = self.graphistry.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') + return g + + def get_schema(self) -> dict: + """ + Retrieves the schema of the Spanner database. + + Returns: + dict: A dictionary containing table names and column details. + """ + schema = {} + try: + cursor = self.connection.cursor() + cursor.execute("SELECT table_name, column_name, spanner_type FROM information_schema.columns") + for row in cursor.fetchall(): + table_name, column_name, spanner_type = row + if table_name not in schema: + schema[table_name] = [] + schema[table_name].append({"column_name": column_name, "type": spanner_type}) + logging.info("Database schema retrieved successfully.") + except Exception as e: + logging.error(f"Failed to retrieve schema: {e}") + return schema + + def validate_data(self, data: dict, schema: dict) -> bool: + """ + Validates input data against the database schema. + + Args: + data (dict): The data to validate. + schema (dict): The schema of the database. + + Returns: + bool: True if the data is valid, False otherwise. 
+ """ + for table, columns in data.items(): + if table not in schema: + logging.error(f"Table {table} does not exist in schema.") + return False + for record in columns: + for key in record.keys(): + if key not in [col["column_name"] for col in schema[table]]: + logging.error(f"Column {key} is not valid for table {table}.") + return False + logging.info("Data validation passed.") + return True + + def dump_config(self) -> dict: + """ + Returns the current configuration of the spannergraph instance. + + Returns: + dict: A dictionary containing configuration details. + """ + return { + "project_id": self + } diff --git a/setup.py b/setup.py index 0dbe7798e9..84cd60a8fa 100755 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ def unique_flatten_dict(d): 'bolt': ['neo4j', 'neotime'], 'nodexl': ['openpyxl==3.1.0', 'xlrd'], 'jupyter': ['ipython'], + 'spanner': ['google-cloud-spanner'], } base_extras_heavy = { From 5d23ae3a63829ec2773bab8ceae06e5f35745242 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Thu, 9 Jan 2025 22:06:42 -0600 Subject: [PATCH 04/33] added lazy imports --- graphistry/plugins/spannergraph.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index d4e47b09c5..6cfd3902ab 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -4,9 +4,6 @@ import time import logging from typing import Any -from google.cloud.spanner_dbapi.connection import connect -from google.cloud.spanner_v1.data_types import JsonObject -from google.cloud.spanner_v1 import KeySet logging.basicConfig(level=logging.INFO) @@ -91,6 +88,7 @@ def __connect(self) -> Any: ConnectionError: If the connection to Spanner fails. 
""" try: + from google.cloud.spanner_dbapi.connection import connect # Lazy import connection = connect(self.instance_id, self.database_id) connection.autocommit = True logging.info("Connected to Spanner database.") @@ -141,6 +139,7 @@ def parse_spanner_json(query_result: QueryResult) -> list: Returns: list: A list of dictionaries containing nodes and edges. """ + from google.cloud.spanner_v1.data_types import JsonObject # Lazy import data = [ query_result.data ] json_list = [] for record in data: From c5fc0d46f9f100ebdd3704d0b1c837c537db2467 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Thu, 9 Jan 2025 22:10:03 -0600 Subject: [PATCH 05/33] removed import from Plotterbase.py --- graphistry/PlotterBase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 9aed7aeab8..5a70614e12 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -38,7 +38,7 @@ from .arrow_uploader import ArrowUploader from .nodexlistry import NodeXLGraphistry from .tigeristry import Tigeristry -from .database_clients.spannergraph import spannergraph +# from .database_clients.spannergraph import spannergraph from .util import setup_logger logger = setup_logger(__name__) From 5a5d211e7c6aec79f85e46a986e0c6d7a5302021 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Sun, 12 Jan 2025 22:09:12 -0600 Subject: [PATCH 06/33] fix pydocs --- graphistry/plugins/spannergraph.py | 145 +++++++++++------------------ 1 file changed, 55 insertions(+), 90 deletions(-) diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index 6cfd3902ab..1a8d471d9c 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -3,7 +3,7 @@ import json import time import logging -from typing import Any +from typing import Any, List, Dict logging.basicConfig(level=logging.INFO) @@ -11,65 +11,53 @@ class QueryResult: """ Encapsulates the results of a query, including metadata. 
- Attributes: - data (list): The raw query results. - execution_time (float): The time taken to execute the query. - record_count (int): The number of records returned. + :ivar list data: The raw query results. + :ivar float execution_time: The time taken to execute the query. + :ivar int record_count: The number of records returned. """ - def __init__(self, data: list, execution_time: float): + def __init__(self, data: List[Any], execution_time: float): + """ + Initializes a QueryResult instance. + + :param data: The raw query results. + :param execution_time: The time taken to execute the query. + """ self.data = data self.execution_time = execution_time self.record_count = len(data) - def summary(self) -> dict: + def summary(self) -> Dict[str, Any]: """ Provides a summary of the query execution. - Returns: - dict: A summary of the query results. + :return: A summary of the query results. """ return { "execution_time": self.execution_time, "record_count": self.record_count } + class spannergraph: """ - A comprehensive interface for interacting with Google Spanner Graph databases. + A comprehensive interface for interacting with Google Spanner Graph databases. - This class provides methods for connecting to the database, executing queries, - processing graph data into structured formats, and constructing visualizations - with the Graphistry library. - - Attributes: - project_id (str): The Google Cloud project ID. - instance_id (str): The Spanner instance ID. - database_id (str): The Spanner database ID. - connection: The active connection to the Spanner database. - graphistry: The Graphistry parent object. - - Key Methods: - __connect: Establish a connection to the Spanner database. - execute_query: Run a GQL query against the database. - parse_spanner_json: Convert raw Spanner JSON results into structured data. - get_nodes_df: Extract and structure graph nodes into a pandas DataFrame. - get_edges_df: Extract and structure graph edges into a pandas DataFrame. 
- gql_to_graph: Combine the above methods to generate a Graphistry visualization. - get_schema: Retrieve the schema of the database. - validate_data: Validate input data for queries or updates. - dump_config: Returns the configuration of the spannergraph instance. + :ivar str project_id: The Google Cloud project ID. + :ivar str instance_id: The Spanner instance ID. + :ivar str database_id: The Spanner database ID. + :ivar Any connection: The active connection to the Spanner database. + :ivar Any graphistry: The Graphistry parent object. """ - def __init__(self, graphistry, project_id: str, instance_id: str, database_id: str): + def __init__(self, graphistry: Any, project_id: str, instance_id: str, database_id: str): """ Initializes the spannergraph instance. - Args: - graphistry: The Graphistry parent object. - project_id (str): The Google Cloud project ID. - instance_id (str): The Spanner instance ID. - database_id (str): The Spanner database ID. + :param graphistry: The Graphistry parent object. + :param project_id: The Google Cloud project ID. + :param instance_id: The Spanner instance ID. + :param database_id: The Spanner database ID. """ self.graphistry = graphistry self.project_id = project_id @@ -79,13 +67,10 @@ def __init__(self, graphistry, project_id: str, instance_id: str, database_id: s def __connect(self) -> Any: """ - Establish a connection to the Spanner database. - - Returns: - Any: A connection object to the Spanner database. + Establishes a connection to the Spanner database. - Raises: - ConnectionError: If the connection to Spanner fails. + :return: A connection object to the Spanner database. + :raises ConnectionError: If the connection to Spanner fails. """ try: from google.cloud.spanner_dbapi.connection import connect # Lazy import @@ -108,14 +93,9 @@ def execute_query(self, query: str) -> QueryResult: """ Executes a GQL query on the Spanner database. - Args: - query (str): The GQL query to execute. 
- - Returns: - QueryResult: The results of the query execution. - - Raises: - RuntimeError: If the query execution fails. + :param query: The GQL query to execute. + :return: The results of the query execution. + :raises RuntimeError: If the query execution fails. """ try: start_time = time.time() @@ -129,18 +109,15 @@ def execute_query(self, query: str) -> QueryResult: raise RuntimeError(f"Query execution failed: {e}") @staticmethod - def parse_spanner_json(query_result: QueryResult) -> list: + def parse_spanner_json(query_result: QueryResult) -> List[Dict[str, Any]]: """ Converts Spanner JSON graph data into structured Python objects. - Args: - query_result (QueryResult): The results of the executed query. - - Returns: - list: A list of dictionaries containing nodes and edges. + :param query_result: The results of the executed query. + :return: A list of dictionaries containing nodes and edges. """ from google.cloud.spanner_v1.data_types import JsonObject # Lazy import - data = [ query_result.data ] + data = [query_result.data] json_list = [] for record in data: for item in record: @@ -168,15 +145,12 @@ def parse_spanner_json(query_result: QueryResult) -> list: return json_list @staticmethod - def get_nodes_df(json_data: list) -> pd.DataFrame: + def get_nodes_df(json_data: List[Dict[str, Any]]) -> pd.DataFrame: """ Converts graph nodes into a pandas DataFrame. - Args: - json_data (list): The structured JSON data containing graph nodes. - - Returns: - pd.DataFrame: A DataFrame containing node information. + :param json_data: The structured JSON data containing graph nodes. + :return: A DataFrame containing node information. 
""" nodes = [ {"label": node["label"], "identifier": node["identifier"], **node["properties"]} @@ -188,15 +162,12 @@ def get_nodes_df(json_data: list) -> pd.DataFrame: return nodes_df @staticmethod - def get_edges_df(json_data: list) -> pd.DataFrame: + def get_edges_df(json_data: List[Dict[str, Any]]) -> pd.DataFrame: """ Converts graph edges into a pandas DataFrame. - Args: - json_data (list): The structured JSON data containing graph edges. - - Returns: - pd.DataFrame: A DataFrame containing edge information. + :param json_data: The structured JSON data containing graph edges. + :return: A DataFrame containing edge information. """ edges = [ { @@ -215,13 +186,10 @@ def get_edges_df(json_data: list) -> pd.DataFrame: def gql_to_graph(self, query: str) -> Any: """ - Executes a query and constructs a graphistry graph from the results. - - Args: - query (str): The GQL query to execute. + Executes a query and constructs a Graphistry graph from the results. - Returns: - Any: A graphistry graph object constructed from the query results. + :param query: The GQL query to execute. + :return: A Graphistry graph object constructed from the query results. """ query_result = self.execute_query(query) json_data = self.parse_spanner_json(query_result) @@ -230,12 +198,11 @@ def gql_to_graph(self, query: str) -> Any: g = self.graphistry.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') return g - def get_schema(self) -> dict: + def get_schema(self) -> Dict[str, List[Dict[str, str]]]: """ Retrieves the schema of the Spanner database. - Returns: - dict: A dictionary containing table names and column details. + :return: A dictionary containing table names and column details. 
""" schema = {} try: @@ -251,16 +218,13 @@ def get_schema(self) -> dict: logging.error(f"Failed to retrieve schema: {e}") return schema - def validate_data(self, data: dict, schema: dict) -> bool: + def validate_data(self, data: Dict[str, List[Dict[str, Any]]], schema: Dict[str, List[Dict[str, str]]]) -> bool: """ Validates input data against the database schema. - Args: - data (dict): The data to validate. - schema (dict): The schema of the database. - - Returns: - bool: True if the data is valid, False otherwise. + :param data: The data to validate. + :param schema: The schema of the database. + :return: True if the data is valid, False otherwise. """ for table, columns in data.items(): if table not in schema: @@ -274,13 +238,14 @@ def validate_data(self, data: dict, schema: dict) -> bool: logging.info("Data validation passed.") return True - def dump_config(self) -> dict: + def dump_config(self) -> Dict[str, str]: """ Returns the current configuration of the spannergraph instance. - Returns: - dict: A dictionary containing configuration details. + :return: A dictionary containing configuration details. 
""" return { - "project_id": self + "project_id": self.project_id, + "instance_id": self.instance_id, + "database_id": self.database_id } From af75ce9aa8c54fdd746581b534e199a79f0d994e Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Mon, 13 Jan 2025 11:10:45 -0600 Subject: [PATCH 07/33] remove database_clients dir, moved to plugins/ --- graphistry/database_clients/spannergraph.py | 287 -------------------- 1 file changed, 287 deletions(-) delete mode 100644 graphistry/database_clients/spannergraph.py diff --git a/graphistry/database_clients/spannergraph.py b/graphistry/database_clients/spannergraph.py deleted file mode 100644 index d4e47b09c5..0000000000 --- a/graphistry/database_clients/spannergraph.py +++ /dev/null @@ -1,287 +0,0 @@ -import os -import pandas as pd -import json -import time -import logging -from typing import Any -from google.cloud.spanner_dbapi.connection import connect -from google.cloud.spanner_v1.data_types import JsonObject -from google.cloud.spanner_v1 import KeySet - -logging.basicConfig(level=logging.INFO) - -class QueryResult: - """ - Encapsulates the results of a query, including metadata. - - Attributes: - data (list): The raw query results. - execution_time (float): The time taken to execute the query. - record_count (int): The number of records returned. - """ - - def __init__(self, data: list, execution_time: float): - self.data = data - self.execution_time = execution_time - self.record_count = len(data) - - def summary(self) -> dict: - """ - Provides a summary of the query execution. - - Returns: - dict: A summary of the query results. - """ - return { - "execution_time": self.execution_time, - "record_count": self.record_count - } - -class spannergraph: - """ - A comprehensive interface for interacting with Google Spanner Graph databases. - - This class provides methods for connecting to the database, executing queries, - processing graph data into structured formats, and constructing visualizations - with the Graphistry library. 
- - Attributes: - project_id (str): The Google Cloud project ID. - instance_id (str): The Spanner instance ID. - database_id (str): The Spanner database ID. - connection: The active connection to the Spanner database. - graphistry: The Graphistry parent object. - - Key Methods: - __connect: Establish a connection to the Spanner database. - execute_query: Run a GQL query against the database. - parse_spanner_json: Convert raw Spanner JSON results into structured data. - get_nodes_df: Extract and structure graph nodes into a pandas DataFrame. - get_edges_df: Extract and structure graph edges into a pandas DataFrame. - gql_to_graph: Combine the above methods to generate a Graphistry visualization. - get_schema: Retrieve the schema of the database. - validate_data: Validate input data for queries or updates. - dump_config: Returns the configuration of the spannergraph instance. - """ - - def __init__(self, graphistry, project_id: str, instance_id: str, database_id: str): - """ - Initializes the spannergraph instance. - - Args: - graphistry: The Graphistry parent object. - project_id (str): The Google Cloud project ID. - instance_id (str): The Spanner instance ID. - database_id (str): The Spanner database ID. - """ - self.graphistry = graphistry - self.project_id = project_id - self.instance_id = instance_id - self.database_id = database_id - self.connection = self.__connect() - - def __connect(self) -> Any: - """ - Establish a connection to the Spanner database. - - Returns: - Any: A connection object to the Spanner database. - - Raises: - ConnectionError: If the connection to Spanner fails. - """ - try: - connection = connect(self.instance_id, self.database_id) - connection.autocommit = True - logging.info("Connected to Spanner database.") - return connection - except Exception as e: - raise ConnectionError(f"Failed to connect to Spanner: {e}") - - def close_connection(self) -> None: - """ - Closes the connection to the Spanner database. 
- """ - if self.connection: - self.connection.close() - logging.info("Connection to Spanner database closed.") - - def execute_query(self, query: str) -> QueryResult: - """ - Executes a GQL query on the Spanner database. - - Args: - query (str): The GQL query to execute. - - Returns: - QueryResult: The results of the query execution. - - Raises: - RuntimeError: If the query execution fails. - """ - try: - start_time = time.time() - cursor = self.connection.cursor() - cursor.execute(query) - results = cursor.fetchall() - execution_time = time.time() - start_time - logging.info(f"Query executed in {execution_time:.4f} seconds.") - return QueryResult(results, execution_time) - except Exception as e: - raise RuntimeError(f"Query execution failed: {e}") - - @staticmethod - def parse_spanner_json(query_result: QueryResult) -> list: - """ - Converts Spanner JSON graph data into structured Python objects. - - Args: - query_result (QueryResult): The results of the executed query. - - Returns: - list: A list of dictionaries containing nodes and edges. 
- """ - data = [ query_result.data ] - json_list = [] - for record in data: - for item in record: - json_entry = {"nodes": [], "edges": []} - elements = json.loads(item.serialize()) if isinstance(item, JsonObject) else item - for element in elements: - if element.get('kind') == 'node': - for label in element.get('labels', []): - json_entry["nodes"].append({ - "label": label, - "identifier": element.get('identifier'), - "properties": element.get('properties', {}) - }) - elif element.get('kind') == 'edge': - for label in element.get('labels', []): - json_entry["edges"].append({ - "label": label, - "identifier": element.get('identifier'), - "source": element.get('source_node_identifier'), - "destination": element.get('destination_node_identifier'), - "properties": element.get('properties', {}) - }) - if json_entry["nodes"] or json_entry["edges"]: - json_list.append(json_entry) - return json_list - - @staticmethod - def get_nodes_df(json_data: list) -> pd.DataFrame: - """ - Converts graph nodes into a pandas DataFrame. - - Args: - json_data (list): The structured JSON data containing graph nodes. - - Returns: - pd.DataFrame: A DataFrame containing node information. - """ - nodes = [ - {"label": node["label"], "identifier": node["identifier"], **node["properties"]} - for entry in json_data - for node in entry["nodes"] - ] - nodes_df = pd.DataFrame(nodes).drop_duplicates() - nodes_df['type'] = nodes_df['label'] - return nodes_df - - @staticmethod - def get_edges_df(json_data: list) -> pd.DataFrame: - """ - Converts graph edges into a pandas DataFrame. - - Args: - json_data (list): The structured JSON data containing graph edges. - - Returns: - pd.DataFrame: A DataFrame containing edge information. 
- """ - edges = [ - { - "label": edge["label"], - "identifier": edge["identifier"], - "source": edge["source"], - "destination": edge["destination"], - **edge["properties"] - } - for entry in json_data - for edge in entry["edges"] - ] - edges_df = pd.DataFrame(edges).drop_duplicates() - edges_df['type'] = edges_df['label'] - return edges_df - - def gql_to_graph(self, query: str) -> Any: - """ - Executes a query and constructs a graphistry graph from the results. - - Args: - query (str): The GQL query to execute. - - Returns: - Any: A graphistry graph object constructed from the query results. - """ - query_result = self.execute_query(query) - json_data = self.parse_spanner_json(query_result) - nodes_df = self.get_nodes_df(json_data) - edges_df = self.get_edges_df(json_data) - g = self.graphistry.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') - return g - - def get_schema(self) -> dict: - """ - Retrieves the schema of the Spanner database. - - Returns: - dict: A dictionary containing table names and column details. - """ - schema = {} - try: - cursor = self.connection.cursor() - cursor.execute("SELECT table_name, column_name, spanner_type FROM information_schema.columns") - for row in cursor.fetchall(): - table_name, column_name, spanner_type = row - if table_name not in schema: - schema[table_name] = [] - schema[table_name].append({"column_name": column_name, "type": spanner_type}) - logging.info("Database schema retrieved successfully.") - except Exception as e: - logging.error(f"Failed to retrieve schema: {e}") - return schema - - def validate_data(self, data: dict, schema: dict) -> bool: - """ - Validates input data against the database schema. - - Args: - data (dict): The data to validate. - schema (dict): The schema of the database. - - Returns: - bool: True if the data is valid, False otherwise. 
- """ - for table, columns in data.items(): - if table not in schema: - logging.error(f"Table {table} does not exist in schema.") - return False - for record in columns: - for key in record.keys(): - if key not in [col["column_name"] for col in schema[table]]: - logging.error(f"Column {key} is not valid for table {table}.") - return False - logging.info("Data validation passed.") - return True - - def dump_config(self) -> dict: - """ - Returns the current configuration of the spannergraph instance. - - Returns: - dict: A dictionary containing configuration details. - """ - return { - "project_id": self - } From ab3e5e721dba9b6f70e11c235ec38d36769f9697 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Tue, 14 Jan 2025 18:02:07 -0600 Subject: [PATCH 08/33] various changes to pass in spanner config to register() --- graphistry/PlotterBase.py | 22 +++++++++- graphistry/plugins/spannergraph.py | 22 +++++----- graphistry/pygraphistry.py | 64 +++++++++++++++++++++++------- 3 files changed, 81 insertions(+), 27 deletions(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 5a70614e12..0db98f9337 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -35,10 +35,11 @@ end_node_id_key, to_bolt_driver) + from .arrow_uploader import ArrowUploader from .nodexlistry import NodeXLGraphistry from .tigeristry import Tigeristry -# from .database_clients.spannergraph import spannergraph +from .plugins import SpannerGraph from .util import setup_logger logger = setup_logger(__name__) @@ -2272,6 +2273,16 @@ def bolt(self, driver): res._bolt_driver = to_bolt_driver(driver) return res + def spanner_init(self, spanner_config): + res = copy.copy(self) + + project_id = spanner_config["project_id"] + instance_id = spanner_config["instance_id"] + database_id = spanner_config["database_id"] + # TODO(tcook): throw an exception when any are missing? 
+ + res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id) + return res def infer_labels(self): """ @@ -2460,6 +2471,15 @@ def cypher(self, query: str, params: Dict[str, Any] = {}) -> Plottable: )\ .nodes(nodes)\ .edges(edges) + + + def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable: + from .pygraphistry import PyGraphistry + + res = copy.copy(self) + + return res._spannergraph.gql_to_graph(query) + def nodexl(self, xls_or_url, source='default', engine=None, verbose=False): diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index 1a8d471d9c..ab807c1624 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -5,9 +5,9 @@ import logging from typing import Any, List, Dict -logging.basicConfig(level=logging.INFO) +# logging.basicConfig(level=logging.INFO) -class QueryResult: +class SpannerQueryResult: """ Encapsulates the results of a query, including metadata. @@ -18,7 +18,7 @@ class QueryResult: def __init__(self, data: List[Any], execution_time: float): """ - Initializes a QueryResult instance. + Initializes a SpannerQueryResult instance. :param data: The raw query results. :param execution_time: The time taken to execute the query. @@ -39,7 +39,7 @@ def summary(self) -> Dict[str, Any]: } -class spannergraph: +class SpannerGraph: """ A comprehensive interface for interacting with Google Spanner Graph databases. @@ -52,7 +52,7 @@ class spannergraph: def __init__(self, graphistry: Any, project_id: str, instance_id: str, database_id: str): """ - Initializes the spannergraph instance. + Initializes the SpannerGraph instance. :param graphistry: The Graphistry parent object. :param project_id: The Google Cloud project ID. @@ -70,7 +70,7 @@ def __connect(self) -> Any: Establishes a connection to the Spanner database. :return: A connection object to the Spanner database. - :raises ConnectionError: If the connection to Spanner fails. 
+ :raises SpannerConnectionError: If the connection to Spanner fails. """ try: from google.cloud.spanner_dbapi.connection import connect # Lazy import @@ -79,7 +79,7 @@ def __connect(self) -> Any: logging.info("Connected to Spanner database.") return connection except Exception as e: - raise ConnectionError(f"Failed to connect to Spanner: {e}") + raise SpannerConnectionError(f"Failed to connect to Spanner: {e}") def close_connection(self) -> None: """ @@ -89,7 +89,7 @@ def close_connection(self) -> None: self.connection.close() logging.info("Connection to Spanner database closed.") - def execute_query(self, query: str) -> QueryResult: + def execute_query(self, query: str) -> SpannerQueryResult: """ Executes a GQL query on the Spanner database. @@ -104,12 +104,12 @@ def execute_query(self, query: str) -> QueryResult: results = cursor.fetchall() execution_time = time.time() - start_time logging.info(f"Query executed in {execution_time:.4f} seconds.") - return QueryResult(results, execution_time) + return SpannerQueryResult(results, execution_time) except Exception as e: raise RuntimeError(f"Query execution failed: {e}") @staticmethod - def parse_spanner_json(query_result: QueryResult) -> List[Dict[str, Any]]: + def parse_spanner_json(query_result: SpannerQueryResult) -> List[Dict[str, Any]]: """ Converts Spanner JSON graph data into structured Python objects. @@ -240,7 +240,7 @@ def validate_data(self, data: Dict[str, List[Dict[str, Any]]], schema: Dict[str, def dump_config(self) -> Dict[str, str]: """ - Returns the current configuration of the spannergraph instance. + Returns the current configuration of the SpannerGraph instance. :return: A dictionary containing configuration details. 
""" diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index bf7521f99d..546b98a584 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -570,6 +570,10 @@ def certificate_validation(value=None): def set_bolt_driver(driver=None): PyGraphistry._config["bolt_driver"] = bolt_util.to_bolt_driver(driver) + @staticmethod + def set_spanner_config(spanner_config=None): + PyGraphistry._config["spanner"] = spanner_config # TODO(tcook): bolt_util.to_bolt_driver(driver) + @staticmethod def register( key: Optional[str] = None, @@ -583,6 +587,7 @@ def register( api: Optional[Literal[1, 3]] = None, certificate_validation: Optional[bool] = None, bolt: Optional[Union[Dict, Any]] = None, + spanner_config: Optional[Union[Dict, Any]] = None, token_refresh_ms: int = 10 * 60 * 1000, store_token_creds_in_memory: Optional[bool] = None, client_protocol_hostname: Optional[str] = None, @@ -620,6 +625,8 @@ def register( :type certificate_validation: Optional[bool] :param bolt: Neo4j bolt information. Optional driver or named constructor arguments for instantiating a new one. :type bolt: Union[dict, Any] + :param spanner_config: Spanner connection information. Named constructor arguments for instantiating a spanner client + :type spanner_config: Union[dict, Any] :param protocol: Protocol used to contact visualization server, defaults to "https". :type protocol: Optional[str] :param token_refresh_ms: Ignored for now; JWT token auto-refreshed on plot() calls. 
@@ -705,6 +712,8 @@ def register( PyGraphistry.certificate_validation(certificate_validation) PyGraphistry.store_token_creds_in_memory(store_token_creds_in_memory) PyGraphistry.set_bolt_driver(bolt) + PyGraphistry.set_spanner_config(spanner_config) + Pygraphistry.spanner_init(spanner_config) # Reset token creds PyGraphistry.__reset_token_creds_in_memory() @@ -1030,6 +1039,31 @@ def bolt(driver=None): """ return Plotter().bolt(driver) + @staticmethod + def spanner_init(spanner_config=None): + """ + + TODO(tcook): fix pydocs + :param spanner_config: a dict of project_id, instance_id and database_id for spanner connection + :return: Plotter w/spanner connection + + Call this to create a Plotter with a Spanner Graph Connection + + **Example** + + :: + + import graphistry + spanner_CONF = { project_id: "my_project", instance_id: "my_instance", database_id: "my_database"} + g = graphistry.spanner_init(spanner_CONF) + + """ + if spanner_config is None: + return None + else: + return Plotter().spanner_init(spanner_config) + + @staticmethod def cypher(query, params={}): """ @@ -1824,21 +1858,21 @@ def tigergraph( protocol, server, web_port, api_port, db, user, pwd, verbose ) - - @staticmethod - def spannergraph(project_id, instance_id, database_id): - """ - Create a new PlotterBase instance with SpannerGraph configured. - - Args: - project_id (str): Google Cloud project ID. - instance_id (str): Spanner instance ID. - database_id (str): Spanner database ID. - - Returns: - PlotterBase: A PlotterBase instance configured with SpannerGraph. - """ - return Plotter().spannergraph(project_id, instance_id, database_id) + # tcook - is this needed? or we use spanner_init? + # @staticmethod + # def spannergraph(project_id, instance_id, database_id): + # """ + # Create a new PlotterBase instance with SpannerGraph configured. + + # Args: + # project_id (str): Google Cloud project ID. + # instance_id (str): Spanner instance ID. + # database_id (str): Spanner database ID. 
+ + # Returns: + # PlotterBase: A PlotterBase instance configured with SpannerGraph. + # """ + # return Plotter().spannergraph(project_id, instance_id, database_id) @staticmethod From d2f6b571d8881f94d8e21992aec79c4c31e55297 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 15 Jan 2025 18:01:03 -0600 Subject: [PATCH 09/33] added debugging --- graphistry/PlotterBase.py | 27 ++++++++++++++++++- graphistry/__init__.py | 1 + graphistry/pygraphistry.py | 54 ++++++++++++++++++++++++-------------- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 0db98f9337..c2538eafbc 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -39,7 +39,7 @@ from .arrow_uploader import ArrowUploader from .nodexlistry import NodeXLGraphistry from .tigeristry import Tigeristry -from .plugins import SpannerGraph +from .plugins.spannergraph import SpannerGraph from .util import setup_logger logger = setup_logger(__name__) @@ -2282,6 +2282,7 @@ def spanner_init(self, spanner_config): # TODO(tcook): throw an exception when any are missing? res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id) + print(f'DEBUG: created SpannerGraph object: {res._spannergraph}') return res def infer_labels(self): @@ -2474,6 +2475,30 @@ def cypher(self, query: str, params: Dict[str, Any] = {}) -> Plottable: def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable: + """ + TODO(tcook): maybe rename to spanner_query_gql since spanner supports multiple languages. 
SQL, GQL, etc + + query google spanner graph database and return Plottable with nodes and edges populated + :param query: GQL query string + :type query: Str + :returns: Plottable + :rtype: Plottable + + **Example: calling spanner_query + :: + + import graphistry + + SPANNER_CONF = { "project_id": PROJECT_ID, + "instance_id": INSTANCE_ID, + "database_id": DATABASE_ID } + + graphistry.register(..., spanner_config=SPANNER_CONF) + + g = graphistry.spanner_query("Graph MyGraph\nMATCH ()-[]->()" ) + + """ + from .pygraphistry import PyGraphistry res = copy.copy(self) diff --git a/graphistry/__init__.py b/graphistry/__init__.py index befef2c1a2..5ff53900b5 100644 --- a/graphistry/__init__.py +++ b/graphistry/__init__.py @@ -31,6 +31,7 @@ bolt, cypher, tigergraph, + spanner_query, gsql, gsql_endpoint, cosmos, diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index 546b98a584..cef620d3c4 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -571,8 +571,33 @@ def set_bolt_driver(driver=None): PyGraphistry._config["bolt_driver"] = bolt_util.to_bolt_driver(driver) @staticmethod - def set_spanner_config(spanner_config=None): - PyGraphistry._config["spanner"] = spanner_config # TODO(tcook): bolt_util.to_bolt_driver(driver) + # def set_spanner_config(spanner_config: Optional[Union[Dict, Any] = None): + def set_spanner_config(spanner_config): + """ + Saves the spanner config to internal Pygraphistry _config + :param spanner_config: dict of the project_id, instance_id and database_id + :type spanner_config: Optional[Union[Dict, Any]] + :returns: None. + :rtype: None + + **Example: calling set_spanner_config** + :: + + import graphistry + graphistry.register(...) 
+ + SPANNER_CONF = { "project_id": PROJECT_ID, + "instance_id": INSTANCE_ID, + "database_id": DATABASE_ID } + + graphistry.set_spanner_config(SPANNER_CONF) + + """ + + if spanner_config is not None: + PyGraphistry._config["spanner"] = spanner_config + + @staticmethod def register( @@ -713,7 +738,7 @@ def register( PyGraphistry.store_token_creds_in_memory(store_token_creds_in_memory) PyGraphistry.set_bolt_driver(bolt) PyGraphistry.set_spanner_config(spanner_config) - Pygraphistry.spanner_init(spanner_config) + PyGraphistry.spanner_init(spanner_config) # Reset token creds PyGraphistry.__reset_token_creds_in_memory() @@ -1043,7 +1068,7 @@ def bolt(driver=None): def spanner_init(spanner_config=None): """ - TODO(tcook): fix pydocs + TODO(tcook): update pydocs :param spanner_config: a dict of project_id, instance_id and database_id for spanner connection :return: Plotter w/spanner connection @@ -1858,21 +1883,10 @@ def tigergraph( protocol, server, web_port, api_port, db, user, pwd, verbose ) - # tcook - is this needed? or we use spanner_init? - # @staticmethod - # def spannergraph(project_id, instance_id, database_id): - # """ - # Create a new PlotterBase instance with SpannerGraph configured. - - # Args: - # project_id (str): Google Cloud project ID. - # instance_id (str): Spanner instance ID. - # database_id (str): Spanner database ID. - - # Returns: - # PlotterBase: A PlotterBase instance configured with SpannerGraph. 
- # """ - # return Plotter().spannergraph(project_id, instance_id, database_id) + @staticmethod + def spanner_query(query: str, params: Dict[str, Any] = {}) -> Plottable: + # TODO(tcook): add pydocs + return Plotter().spanner_query(query, params) @staticmethod @@ -2536,7 +2550,7 @@ def _handle_api_response(response): cypher = PyGraphistry.cypher nodexl = PyGraphistry.nodexl tigergraph = PyGraphistry.tigergraph -spannergraph = PyGraphistry.spannergraph +spanner_query = PyGraphistry.spanner_query cosmos = PyGraphistry.cosmos neptune = PyGraphistry.neptune gremlin = PyGraphistry.gremlin From 6c7324abeff72615980b2dd7257f29f8d2e3bf9c Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 15 Jan 2025 18:48:03 -0600 Subject: [PATCH 10/33] added demo notebook, debug output --- .../google_spanner_finance_graph.ipynb | 139 ++++++++++++++++++ graphistry/PlotterBase.py | 19 ++- graphistry/plugins/spannergraph.py | 5 + 3 files changed, 160 insertions(+), 3 deletions(-) create mode 100644 demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb new file mode 100644 index 0000000000..69690360e8 --- /dev/null +++ b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "9418e0e1-648d-4cd7-a264-9bf910cbcfe3", + "metadata": {}, + "outputs": [], + "source": [ + "import graphistry\n", + "graphistry.__version__" + ] + }, + { + "cell_type": "markdown", + "id": "1407509d-f079-4f66-bc5f-bd93c9510d09", + "metadata": {}, + "source": [ + "#### Settings " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02515636-e226-429a-bac5-6bb61ef69ead", + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT_ID = \"graphistrycloud-dev\" \n", + "INSTANCE_ID = \"tcook-test-90-day-trial\" \n", + "DATABASE_ID 
= \"finance-graph-db\" \n", + "LIMIT = \"limit 100\" \n", + "\n", + "SPANNER_CONF = { \"project_id\": PROJECT_ID, \n", + " \"instance_id\": INSTANCE_ID, \n", + " \"database_id\": DATABASE_ID }\n", + "\n", + "print(SPANNER_CONF)" + ] + }, + { + "cell_type": "markdown", + "id": "97649200-c7ab-4041-bb19-f7ab6363ead3", + "metadata": {}, + "source": [ + "#### gcloud init " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd1c8aaf-d927-4989-aab2-3a5d0e8858ca", + "metadata": {}, + "outputs": [], + "source": [ + "# Set the project id\n", + "!gcloud config set project {PROJECT_ID}\n", + "%env GOOGLE_CLOUD_PROJECT={PROJECT_ID}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b42c055-2f60-4ac8-975d-f005e37afec0", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud auth application-default login" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8ad0f91-19b6-4407-9c19-dcbf329354fb", + "metadata": {}, + "outputs": [], + "source": [ + "# register \n", + "\n", + "import os\n", + "os.environ[\"GRAPHISTRY_SERVER\"] = 'hub.graphistry.com'\n", + "\n", + "print(f'server = {os.getenv(\"GRAPHISTRY_SERVER\")}')\n", + "print(f'protocol = {os.getenv(\"GRAPHISTRY_PROTOCOL\")}')\n", + "print(f'username = {os.getenv(\"GRAPHISTRY_USERNAME\")}')\n", + "\n", + "graphistry.register(api=3, \n", + " protocol = \"https\", \n", + " server = os.getenv(\"GRAPHISTRY_SERVER\"),\n", + " username = os.getenv(\"GRAPHISTRY_USERNAME\"), \n", + " password = os.getenv(\"GRAPHISTRY_PASSWORD\"),\n", + " spanner_config=SPANNER_CONF\n", + " ) \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7ab1379-14bf-4c03-b5b2-28df22609029", + "metadata": {}, + "outputs": [], + "source": [ + "query=f'''GRAPH FinGraph\n", + "MATCH p = (a)-[b]->(c) where 1=1 {LIMIT} return TO_JSON(p) as path'''\n", + "\n", + "g = graphistry.spanner_query(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"aeead725-928a-44fe-b5b5-630c830502e0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv - spanner-test-5", + "language": "python", + "name": "spanner-test-5" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index c2538eafbc..02a2da1c09 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -2282,7 +2282,7 @@ def spanner_init(self, spanner_config): # TODO(tcook): throw an exception when any are missing? res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id) - print(f'DEBUG: created SpannerGraph object: {res._spannergraph}') + print(f'DEBUG: created SpannerGraph object: {res._spannergraph} type(res): {type(res)}') return res def infer_labels(self): @@ -2499,11 +2499,24 @@ def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable: """ + # is this needed? 
from .pygraphistry import PyGraphistry + print(f'DEBUG: PlotterBase.py spanner_query()') res = copy.copy(self) - - return res._spannergraph.gql_to_graph(query) + + if res._spannergraph is None: + spanner_config = PyGraphistry._config["spanner"] + if spanner_config is not None: + print(f'DEBUG: Spanner Config: {spanner_config}') + else: + print(f'DEBUG: PyGraphistry._config["spanner"] is None') + + res._spannergraph = res.spanner_init(PyGraphistry._config["spanner"]) + return res._spannergraph.gql_to_graph(query) + else: + print(f'DEBUG: res._spannergraph is NOT None') + return res._spannergraph.gql_to_graph(query) def nodexl(self, xls_or_url, source='default', engine=None, verbose=False): diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index ab807c1624..1bf889dd3b 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -26,6 +26,7 @@ def __init__(self, data: List[Any], execution_time: float): self.data = data self.execution_time = execution_time self.record_count = len(data) + print('DEBUG: SpannerQueryResults init()') def summary(self) -> Dict[str, Any]: """ @@ -63,7 +64,9 @@ def __init__(self, graphistry: Any, project_id: str, instance_id: str, database_ self.project_id = project_id self.instance_id = instance_id self.database_id = database_id + print('DEBUG: SpannerGraph init()') self.connection = self.__connect() + print(f'DEBUG: SpannerQueryResults connection = {self.connection}') def __connect(self) -> Any: """ @@ -97,6 +100,8 @@ def execute_query(self, query: str) -> SpannerQueryResult: :return: The results of the query execution. :raises RuntimeError: If the query execution fails. 
""" + print(f'DEBUG: SpannerGraph execute_query() query:{query}') + try: start_time = time.time() cursor = self.connection.cursor() From 4e7682059829ba93c85d46a3e5dccb019c1addc2 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 15 Jan 2025 19:21:19 -0600 Subject: [PATCH 11/33] minor changes --- graphistry/PlotterBase.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 02a2da1c09..3273e000e4 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -2283,7 +2283,7 @@ def spanner_init(self, spanner_config): res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id) print(f'DEBUG: created SpannerGraph object: {res._spannergraph} type(res): {type(res)}') - return res + return res def infer_labels(self): """ @@ -2512,7 +2512,7 @@ def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable: else: print(f'DEBUG: PyGraphistry._config["spanner"] is None') - res._spannergraph = res.spanner_init(PyGraphistry._config["spanner"]) + res = res.spanner_init(PyGraphistry._config["spanner"]) return res._spannergraph.gql_to_graph(query) else: print(f'DEBUG: res._spannergraph is NOT None') From e00001838e82ec0dab094f80a2724b56497f8017 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Thu, 16 Jan 2025 00:58:07 -0600 Subject: [PATCH 12/33] various changes for error handling, imports and pydocs --- .../google_spanner_finance_graph.ipynb | 130 +++++++++++++---- graphistry/PlotterBase.py | 24 +-- graphistry/plugins/spannergraph.py | 138 +++++++++++------- graphistry/pygraphistry.py | 1 - 4 files changed, 201 insertions(+), 92 deletions(-) diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb index 69690360e8..e2d4a42713 100644 --- a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb +++ 
b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -8,6 +8,7 @@ "outputs": [], "source": [ "import graphistry\n", + "import json\n", "graphistry.__version__" ] }, @@ -16,7 +17,7 @@ "id": "1407509d-f079-4f66-bc5f-bd93c9510d09", "metadata": {}, "source": [ - "#### Settings " + "### settings" ] }, { @@ -26,16 +27,20 @@ "metadata": {}, "outputs": [], "source": [ - "PROJECT_ID = \"graphistrycloud-dev\" \n", - "INSTANCE_ID = \"tcook-test-90-day-trial\" \n", - "DATABASE_ID = \"finance-graph-db\" \n", - "LIMIT = \"limit 100\" \n", + "PROJECT_ID = \"my_project\" \n", + "INSTANCE_ID = \"my_instance\" \n", + "DATABASE_ID = \"finance-graph-db\" \n", + "\n", + "# optional setting to limit the number of records returned\n", + "LIMIT_CLAUSE = \"\"\n", + "# or use: \n", + "# LIMIT_CLAUSE = \"limit 1000\" \n", "\n", "SPANNER_CONF = { \"project_id\": PROJECT_ID, \n", " \"instance_id\": INSTANCE_ID, \n", " \"database_id\": DATABASE_ID }\n", "\n", - "print(SPANNER_CONF)" + "# print(json.dumps(SPANNER_CONF, indent=4))" ] }, { @@ -43,7 +48,21 @@ "id": "97649200-c7ab-4041-bb19-f7ab6363ead3", "metadata": {}, "source": [ - "#### gcloud init " + "### graphistry register and gcloud init" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8ad0f91-19b6-4407-9c19-dcbf329354fb", + "metadata": {}, + "outputs": [], + "source": [ + "# graphistry register \n", + "\n", + "# To specify Graphistry account & server, use:\n", + "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", + "# For more options, see https://pygraphistry.readthedocs.io/en/latest/server/register.html\n" ] }, { @@ -53,7 +72,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Set the project id\n", + "# Set the google project id\n", "!gcloud config set project {PROJECT_ID}\n", "%env GOOGLE_CLOUD_PROJECT={PROJECT_ID}" ] @@ -65,32 +84,15 @@ "metadata": {}, "outputs": [], "source": [ - "!gcloud auth application-default login" + 
"#!gcloud auth application-default login" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "a8ad0f91-19b6-4407-9c19-dcbf329354fb", + "cell_type": "markdown", + "id": "56bc01a7-76ea-44f6-b1cc-c5488c5c5922", "metadata": {}, - "outputs": [], "source": [ - "# register \n", - "\n", - "import os\n", - "os.environ[\"GRAPHISTRY_SERVER\"] = 'hub.graphistry.com'\n", - "\n", - "print(f'server = {os.getenv(\"GRAPHISTRY_SERVER\")}')\n", - "print(f'protocol = {os.getenv(\"GRAPHISTRY_PROTOCOL\")}')\n", - "print(f'username = {os.getenv(\"GRAPHISTRY_USERNAME\")}')\n", - "\n", - "graphistry.register(api=3, \n", - " protocol = \"https\", \n", - " server = os.getenv(\"GRAPHISTRY_SERVER\"),\n", - " username = os.getenv(\"GRAPHISTRY_USERNAME\"), \n", - " password = os.getenv(\"GRAPHISTRY_PASSWORD\"),\n", - " spanner_config=SPANNER_CONF\n", - " ) \n" + "### Spanner GQL Query to Graphistry Visualization" ] }, { @@ -101,7 +103,7 @@ "outputs": [], "source": [ "query=f'''GRAPH FinGraph\n", - "MATCH p = (a)-[b]->(c) where 1=1 {LIMIT} return TO_JSON(p) as path'''\n", + "MATCH p = (a)-[b]->(c) where 1=1 {LIMIT_CLAUSE} return TO_JSON(p) as path'''\n", "\n", "g = graphistry.spanner_query(query)" ] @@ -112,14 +114,78 @@ "id": "aeead725-928a-44fe-b5b5-630c830502e0", "metadata": {}, "outputs": [], + "source": [ + "g.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f172d3e-108a-4a18-a88e-e012988013c5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "b40b6dae-0770-4839-a392-7cf16bee65d6", + "metadata": {}, + "source": [ + "#### inspect contents of graphistry graph (nodes and edges): " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fed46dd-2bb5-4563-9a21-920d299ba30d", + "metadata": {}, + "outputs": [], + "source": [ + "len(g._nodes), len(g._edges)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad33fde2-3aae-4da0-8532-41d489c6fff1", + "metadata": {}, + 
"outputs": [], + "source": [ + "g._nodes.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "279f4cb9-daa1-4c2b-af69-85a5b0a771fb", + "metadata": {}, + "outputs": [], + "source": [ + "g._edges.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a60ee00-055d-4cee-b286-c5e3d6d85f09", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11e41457-303c-4d5e-ae0e-7015db33d9f7", + "metadata": {}, + "outputs": [], "source": [] } ], "metadata": { "kernelspec": { - "display_name": "venv - spanner-test-5", + "display_name": "venv - spanner-test-1", "language": "python", - "name": "spanner-test-5" + "name": "spanner-test-1" }, "language_info": { "codemirror_mode": { diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 3273e000e4..48b0a7739d 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -2272,17 +2272,24 @@ def bolt(self, driver): res = copy.copy(self) res._bolt_driver = to_bolt_driver(driver) return res - + + # TODO(tcook): add pydocs, typing def spanner_init(self, spanner_config): res = copy.copy(self) project_id = spanner_config["project_id"] instance_id = spanner_config["instance_id"] database_id = spanner_config["database_id"] - # TODO(tcook): throw an exception when any are missing? 
+ + # check if valid + required_keys = ["project_id", "instance_id", "database_id"] + for key in required_keys: + value = spanner_config.get(key) + if not value: # checks for None or empty values + raise ValueError(f"Missing or invalid value for required Spanner configuration: '{key}'") res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id) - print(f'DEBUG: created SpannerGraph object: {res._spannergraph} type(res): {type(res)}') + logger.debug("Created SpannerGraph object: {res._spannergraph}") return res def infer_labels(self): @@ -2481,7 +2488,7 @@ def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable: query google spanner graph database and return Plottable with nodes and edges populated :param query: GQL query string :type query: Str - :returns: Plottable + :returns: Plottable with the results of GQL query as a graph :rtype: Plottable **Example: calling spanner_query @@ -2496,26 +2503,25 @@ def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable: graphistry.register(..., spanner_config=SPANNER_CONF) g = graphistry.spanner_query("Graph MyGraph\nMATCH ()-[]->()" ) + + g.plot() """ - # is this needed? 
from .pygraphistry import PyGraphistry - print(f'DEBUG: PlotterBase.py spanner_query()') res = copy.copy(self) if res._spannergraph is None: spanner_config = PyGraphistry._config["spanner"] if spanner_config is not None: - print(f'DEBUG: Spanner Config: {spanner_config}') + logger.debug(f"Spanner Config: {spanner_config}") else: - print(f'DEBUG: PyGraphistry._config["spanner"] is None') + logger.debug(f'PyGraphistry._config["spanner"] is None') res = res.spanner_init(PyGraphistry._config["spanner"]) return res._spannergraph.gql_to_graph(query) else: - print(f'DEBUG: res._spannergraph is NOT None') return res._spannergraph.gql_to_graph(query) diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index 1bf889dd3b..01d096de01 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -2,10 +2,19 @@ import pandas as pd import json import time -import logging from typing import Any, List, Dict -# logging.basicConfig(level=logging.INFO) +from graphistry.util import setup_logger +logger = setup_logger(__name__) + +import logging +logging.basicConfig(level=logging.INFO) + +from google.cloud.spanner_v1.data_types import JsonObject + +class SpannerConnectionError(Exception): + """Custom exception for errors related to Spanner connection.""" + pass class SpannerQueryResult: """ @@ -26,7 +35,6 @@ def __init__(self, data: List[Any], execution_time: float): self.data = data self.execution_time = execution_time self.record_count = len(data) - print('DEBUG: SpannerQueryResults init()') def summary(self) -> Dict[str, Any]: """ @@ -64,19 +72,19 @@ def __init__(self, graphistry: Any, project_id: str, instance_id: str, database_ self.project_id = project_id self.instance_id = instance_id self.database_id = database_id - print('DEBUG: SpannerGraph init()') self.connection = self.__connect() - print(f'DEBUG: SpannerQueryResults connection = {self.connection}') def __connect(self) -> Any: """ Establishes a connection to the 
Spanner database. :return: A connection object to the Spanner database. + :rtype: google.cloud.spanner_dbapi.connection :raises SpannerConnectionError: If the connection to Spanner fails. """ + from google.cloud.spanner_dbapi.connection import connect + try: - from google.cloud.spanner_dbapi.connection import connect # Lazy import connection = connect(self.instance_id, self.database_id) connection.autocommit = True logging.info("Connected to Spanner database.") @@ -98,9 +106,10 @@ def execute_query(self, query: str) -> SpannerQueryResult: :param query: The GQL query to execute. :return: The results of the query execution. + :rtype: SpannerQueryResult :raises RuntimeError: If the query execution fails. """ - print(f'DEBUG: SpannerGraph execute_query() query:{query}') + logger.debug(f' SpannerGraph execute_query() query:{query}\n') try: start_time = time.time() @@ -114,79 +123,103 @@ def execute_query(self, query: str) -> SpannerQueryResult: raise RuntimeError(f"Query execution failed: {e}") @staticmethod - def parse_spanner_json(query_result: SpannerQueryResult) -> List[Dict[str, Any]]: - """ - Converts Spanner JSON graph data into structured Python objects. - - :param query_result: The results of the executed query. - :return: A list of dictionaries containing nodes and edges. 
- """ - from google.cloud.spanner_v1.data_types import JsonObject # Lazy import - data = [query_result.data] + def convert_spanner_json(data): + from google.cloud.spanner_v1.data_types import JsonObject json_list = [] - for record in data: - for item in record: + for item in data: + for elements in item: json_entry = {"nodes": [], "edges": []} - elements = json.loads(item.serialize()) if isinstance(item, JsonObject) else item for element in elements: - if element.get('kind') == 'node': - for label in element.get('labels', []): - json_entry["nodes"].append({ - "label": label, - "identifier": element.get('identifier'), - "properties": element.get('properties', {}) - }) - elif element.get('kind') == 'edge': - for label in element.get('labels', []): - json_entry["edges"].append({ - "label": label, - "identifier": element.get('identifier'), - "source": element.get('source_node_identifier'), - "destination": element.get('destination_node_identifier'), - "properties": element.get('properties', {}) - }) - if json_entry["nodes"] or json_entry["edges"]: + element_dict_list = json.loads(element.serialize()) if isinstance(element, JsonObject) else element + for element_dict in element_dict_list: + if element_dict.get('kind') == 'node': + labels = element_dict.get('labels', []) + for label in labels: + node_data = { + "label": label, + "identifier": element_dict.get('identifier'), + "properties": element_dict.get('properties', {}) + } + json_entry["nodes"].append(node_data) + elif element_dict.get('kind') == 'edge': + labels = element_dict.get('labels', []) + for label in labels: + edge_data = { + "label": label, + "identifier": element_dict.get('identifier'), + "source": element_dict.get('source_node_identifier'), + "destination": element_dict.get('destination_node_identifier'), + "properties": element_dict.get('properties') + } + json_entry["edges"].append(edge_data) + if json_entry["nodes"] or json_entry["edges"]: # only add non-empty entries json_list.append(json_entry) 
return json_list @staticmethod - def get_nodes_df(json_data: List[Dict[str, Any]]) -> pd.DataFrame: + def get_nodes_df(json_data: list) -> pd.DataFrame: """ - Converts graph nodes into a pandas DataFrame. - + Converts spanner json nodes into a pandas DataFrame. + :param json_data: The structured JSON data containing graph nodes. - :return: A DataFrame containing node information. + :return: A DataFrame containing node information + :rtype: pd.DataFrame """ nodes = [ - {"label": node["label"], "identifier": node["identifier"], **node["properties"]} + { + "label": node.get("label"), + "identifier": node["identifier"], + **node.get("properties", {}) + } for entry in json_data - for node in entry["nodes"] + for node in entry.get("nodes", []) ] nodes_df = pd.DataFrame(nodes).drop_duplicates() - nodes_df['type'] = nodes_df['label'] + + # if 'type' property exists, skip setting and warn + if "type" not in nodes_df.columns: + # check 'label' column exists before assigning it to 'type' + if "label" in nodes_df.columns: + nodes_df['type'] = nodes_df['label'] + else: + nodes_df['type'] = None # Assign a default value if 'label' is missing + else: + logger.warn("unable to assign 'type' from label, column exists\n") + return nodes_df @staticmethod - def get_edges_df(json_data: List[Dict[str, Any]]) -> pd.DataFrame: + def get_edges_df(json_data: list) -> pd.DataFrame: """ - Converts graph edges into a pandas DataFrame. + Converts spanner json edges into a pandas DataFrame :param json_data: The structured JSON data containing graph edges. :return: A DataFrame containing edge information. 
+ :rtype: pd.DataFrame """ edges = [ { - "label": edge["label"], + "label": edge.get("label"), "identifier": edge["identifier"], "source": edge["source"], "destination": edge["destination"], - **edge["properties"] + **edge.get("properties", {}) } for entry in json_data - for edge in entry["edges"] + for edge in entry.get("edges", []) ] edges_df = pd.DataFrame(edges).drop_duplicates() - edges_df['type'] = edges_df['label'] + + # if 'type' property exists, skip setting and warn + if "type" not in edges_df.columns: + # check 'label' column exists before assigning it to 'type' + if "label" in edges_df.columns: + edges_df['type'] = edges_df['label'] + else: + edges_df['type'] = None # Assign a default value if 'label' is missing + else: + logger.warn("unable to assign 'type' from label, column exists\n") + return edges_df def gql_to_graph(self, query: str) -> Any: @@ -197,12 +230,17 @@ def gql_to_graph(self, query: str) -> Any: :return: A Graphistry graph object constructed from the query results. """ query_result = self.execute_query(query) - json_data = self.parse_spanner_json(query_result) + # convert json result set to a list + query_result_list = [ query_result.data ] + json_data = self.convert_spanner_json(query_result_list) nodes_df = self.get_nodes_df(json_data) edges_df = self.get_edges_df(json_data) + # TODO(tcook): add more error handling here if nodes or edges are empty g = self.graphistry.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') return g + # TODO(tcook): add wrapper funcs in PlotterBase for these utility functions: + def get_schema(self) -> Dict[str, List[Dict[str, str]]]: """ Retrieves the schema of the Spanner database. 
diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index cef620d3c4..50a320a51c 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -738,7 +738,6 @@ def register( PyGraphistry.store_token_creds_in_memory(store_token_creds_in_memory) PyGraphistry.set_bolt_driver(bolt) PyGraphistry.set_spanner_config(spanner_config) - PyGraphistry.spanner_init(spanner_config) # Reset token creds PyGraphistry.__reset_token_creds_in_memory() From 70a038d7aa0ecc247e0d81b17904de143c6a9715 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Thu, 16 Jan 2025 01:03:21 -0600 Subject: [PATCH 13/33] fixed register and uncomment gcloud --- .../spanner/google_spanner_finance_graph.ipynb | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb index e2d4a42713..aba918a760 100644 --- a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb +++ b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -17,7 +17,7 @@ "id": "1407509d-f079-4f66-bc5f-bd93c9510d09", "metadata": {}, "source": [ - "### settings" + "### Settings" ] }, { @@ -48,7 +48,7 @@ "id": "97649200-c7ab-4041-bb19-f7ab6363ead3", "metadata": {}, "source": [ - "### graphistry register and gcloud init" + "### Graphistry register and gcloud init" ] }, { @@ -62,7 +62,15 @@ "\n", "# To specify Graphistry account & server, use:\n", "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", - "# For more options, see https://pygraphistry.readthedocs.io/en/latest/server/register.html\n" + "# For more options, see https://pygraphistry.readthedocs.io/en/latest/server/register.html\n", + "\n", + "graphistry.register(api=3, \n", + " protocol = \"http\", \n", + " server = os.getenv(\"GRAPHISTRY_SERVER\"),\n", + " username = os.getenv(\"GRAPHISTRY_USERNAME\"), 
\n", + " password = os.getenv(\"GRAPHISTRY_PASSWORD\"), \n", + " spanner_conf=SPANNER_CONF\n", + " )" ] }, { @@ -84,7 +92,7 @@ "metadata": {}, "outputs": [], "source": [ - "#!gcloud auth application-default login" + "!gcloud auth application-default login" ] }, { From f3742792ca284123ea08052327c24be07797529e Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Thu, 16 Jan 2025 01:06:51 -0600 Subject: [PATCH 14/33] fixed typo in register --- .../spanner/google_spanner_finance_graph.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb index aba918a760..b19a2cee6a 100644 --- a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb +++ b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -69,7 +69,7 @@ " server = os.getenv(\"GRAPHISTRY_SERVER\"),\n", " username = os.getenv(\"GRAPHISTRY_USERNAME\"), \n", " password = os.getenv(\"GRAPHISTRY_PASSWORD\"), \n", - " spanner_conf=SPANNER_CONF\n", + " spanner_config=SPANNER_CONF\n", " )" ] }, From 116334a3988874ca743b73654e3cf0e3a2ababa4 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Thu, 16 Jan 2025 20:12:00 -0600 Subject: [PATCH 15/33] added spanner_query_to_df and other fixes from PR comments --- .../google_spanner_finance_graph.ipynb | 99 ++++++++-- graphistry/PlotterBase.py | 108 +++++++++-- graphistry/__init__.py | 3 +- graphistry/plugins/spannergraph.py | 179 ++++++++---------- graphistry/pygraphistry.py | 99 +++++++++- 5 files changed, 349 insertions(+), 139 deletions(-) diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb index b19a2cee6a..3039268ba8 100644 --- a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb +++ b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -95,31 +95,50 @@ 
"!gcloud auth application-default login" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "11e41457-303c-4d5e-ae0e-7015db33d9f7", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", - "id": "56bc01a7-76ea-44f6-b1cc-c5488c5c5922", + "id": "88eb24b0-7d3b-4629-813c-fc989ba1ea90", "metadata": {}, "source": [ - "### Spanner GQL Query to Graphistry Visualization" + "### Example 1: GQL Path Query to Graphistry Visualization of all nodes and edges (LIMIT optional) \n", + "\n", + "to extract the data from Spanner Graph as a graph with nodes and edges in a single object, a GQL path query is required. \n", + "\n", + "The format of a path query is as follows, note the p= at the start of the MATCH clause, and the SAFE_TO_JSON(p) without these, \n", + "the query will not produce the results needed to properly load a graphistry graph. LIMIT is optional, but for large graphs with millions\n", + " of edges or more, it's best to filter either in the query or use LIMIT so as not to exhaust GPU memory. 
\n", + "\n", + "```python\n", + "GRAPH FinGraph\n", + "MATCH p = (a)-[b]->(c) where 1=1 LIMIT 10000 return SAFE_TO_JSON(p) as path\n", + "```\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "e7ab1379-14bf-4c03-b5b2-28df22609029", + "id": "58ee08b2-29e1-47db-b0a8-440f6171e54d", "metadata": {}, "outputs": [], "source": [ "query=f'''GRAPH FinGraph\n", - "MATCH p = (a)-[b]->(c) where 1=1 {LIMIT_CLAUSE} return TO_JSON(p) as path'''\n", + "MATCH p = (a)-[b]->(c) where 1=1 {LIMIT_CLAUSE} return SAFE_TO_JSON(p) as path'''\n", "\n", - "g = graphistry.spanner_query(query)" + "g = graphistry.spanner_gql_to_g(query)" ] }, { "cell_type": "code", "execution_count": null, - "id": "aeead725-928a-44fe-b5b5-630c830502e0", + "id": "3d606a3f-e807-4fa7-893e-52a95d238cc0", "metadata": {}, "outputs": [], "source": [ @@ -129,23 +148,23 @@ { "cell_type": "code", "execution_count": null, - "id": "5f172d3e-108a-4a18-a88e-e012988013c5", + "id": "ae275af9-f354-454b-bbeb-e423ae4acfba", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", - "id": "b40b6dae-0770-4839-a392-7cf16bee65d6", + "id": "b9d18502-9bc1-4f7e-af3a-b99f7d848e08", "metadata": {}, "source": [ - "#### inspect contents of graphistry graph (nodes and edges): " + "#### Example 1.1 - inspect contents of graphistry graph (nodes and edges): " ] }, { "cell_type": "code", "execution_count": null, - "id": "1fed46dd-2bb5-4563-9a21-920d299ba30d", + "id": "42e451d7-8f97-45a8-bb45-7c4271f11f68", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +174,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ad33fde2-3aae-4da0-8532-41d489c6fff1", + "id": "e5078632-24b2-4139-8e4f-a2c18f1efd94", "metadata": {}, "outputs": [], "source": [ @@ -165,7 +184,7 @@ { "cell_type": "code", "execution_count": null, - "id": "279f4cb9-daa1-4c2b-af69-85a5b0a771fb", + "id": "376bf2a7-931a-4c3f-bfda-3413734e5ad7", "metadata": {}, "outputs": [], "source": [ @@ -175,15 +194,67 @@ { "cell_type": "code", 
"execution_count": null, - "id": "2a60ee00-055d-4cee-b286-c5e3d6d85f09", + "id": "05d8b293-62b7-4056-b924-ff04284559b9", "metadata": {}, "outputs": [], "source": [] }, + { + "cell_type": "markdown", + "id": "0013cceb-f32f-48a0-9f02-6b75a2704294", + "metadata": {}, + "source": [ + "### Example 2: Spanner GQL Query to pandas dataframe (LIMIT optional) \n", + "\n", + "This example shows a non-path query that returns tabular results, which are then convered to a dataframe for easy manipulation and inspection of the results. \n", + "\n", + "```python\n", + "GRAPH FinGraph \n", + "MATCH (p:Person)-[]-()->(l:Loan)\n", + "RETURN p.id as ID, p.name AS Name, SUM(l.loan_amount) AS TotalBorrowed\n", + "ORDER BY TotalBorrowed DESC\n", + "LIMIT 10```\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "11e41457-303c-4d5e-ae0e-7015db33d9f7", + "id": "523f57b2-d09f-4aa2-8626-5af74d5d9a20", + "metadata": {}, + "outputs": [], + "source": [ + "query_top10='''GRAPH FinGraph \n", + "MATCH (p:Person)-[]-()->(l:Loan) WHERE 1=1\n", + "RETURN p.id as ID, p.name AS Name, SUM(l.loan_amount) AS TotalBorrowed\n", + "ORDER BY TotalBorrowed DESC\n", + "LIMIT 10'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a51917d3-6c17-4438-981d-b7d441ec89ec", + "metadata": {}, + "outputs": [], + "source": [ + "Top10_Borrowers_df = graphistry.spanner_query_to_df(query_top10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aab6fb57-6f09-41ae-927a-4f7b5f47db19", + "metadata": {}, + "outputs": [], + "source": [ + "Top10_Borrowers_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a55789a-16ed-4b6f-b2d0-e374999f1806", "metadata": {}, "outputs": [], "source": [] diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 48b0a7739d..ddc1ac5b09 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -2273,22 +2273,38 @@ def bolt(self, driver): res._bolt_driver = 
to_bolt_driver(driver) return res - # TODO(tcook): add pydocs, typing - def spanner_init(self, spanner_config): + def spanner_init(self: Plottable, spanner_config: Dict[str, str]) -> Plottable: + """ + Initializes a SpannerGraph object with the provided configuration and connects to the instance db + + spanner_config dict must contain the include the following keys, credentials_file is optional: + - "project_id": The GCP project ID. + - "instance_id": The Spanner instance ID. + - "database_id": The Spanner database ID. + - "credentials_file": json file API key for service accounts + + :param spanner_config A dictionary containing the Spanner configuration. + :type (Dict[str, str]) + :return: Plottable with a Spanner connection + :rtype: Plottable + :raises ValueError: If any of the required keys in `spanner_config` are missing or have invalid values. + + """ res = copy.copy(self) project_id = spanner_config["project_id"] instance_id = spanner_config["instance_id"] database_id = spanner_config["database_id"] + credentials_file = spanner_config["credentials_file"] # check if valid required_keys = ["project_id", "instance_id", "database_id"] for key in required_keys: value = spanner_config.get(key) - if not value: # checks for None or empty values + if not value: # check for None or empty values raise ValueError(f"Missing or invalid value for required Spanner configuration: '{key}'") - res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id) + res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id, credentials_file) logger.debug("Created SpannerGraph object: {res._spannergraph}") return res @@ -2481,28 +2497,91 @@ def cypher(self, query: str, params: Dict[str, Any] = {}) -> Plottable: .edges(edges) - def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable: + def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: """ - TODO(tcook): maybe rename to spanner_query_gql since spanner supports multiple 
languages. SQL, GQL, etc + Submit GQL query to google spanner graph database and return Plottable with nodes and edges populated + + GQL must be a path query with a syntax similar to the following, it's recommended to return the path with + SAFE_TO_JSON(p), TO_JSON() can also be used, but not recommend. LIMIT is optional, but for large graphs with millions + of edges or more, it's best to filter either in the query or use LIMIT so as not to exhaust GPU memory. + + query=f'''GRAPH my_graph + MATCH p = (a)-[b]->(c) LIMIT 100000 return SAFE_TO_JSON(p) as path''' - query google spanner graph database and return Plottable with nodes and edges populated :param query: GQL query string :type query: Str + :returns: Plottable with the results of GQL query as a graph :rtype: Plottable - **Example: calling spanner_query + **Example: calling spanner_gql_to_g + :: + + import graphistry + + # credentials_file is optional, all others are required + SPANNER_CONF = { "project_id": PROJECT_ID, + "instance_id": INSTANCE_ID, + "database_id": DATABASE_ID, + "credentials_file": CREDENTIALS_FILE } + + graphistry.register(..., spanner_config=SPANNER_CONF) + + query=f'''GRAPH my_graph + MATCH p = (a)-[b]->(c) LIMIT 100000 return SAFE_TO_JSON(p) as path''' + + g = graphistry.spanner_gql_to_g(query) + + g.plot() + + """ + + from .pygraphistry import PyGraphistry + + res = copy.copy(self) + + if res._spannergraph is None: + spanner_config = PyGraphistry._config["spanner"] + if spanner_config is not None: + logger.debug(f"Spanner Config: {spanner_config}") + else: + logger.warn(f'PyGraphistry._config["spanner"] is None') + + res = res.spanner_init(PyGraphistry._config["spanner"]) + + return res._spannergraph.gql_to_graph(query) + + def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame: + """ + + Submit query to google spanner database and return a df of the results + + query can be SQL or GQL as long as table of results are returned + + query='SELECT * from Account limit 10000' + 
+ :param query: query string + :type query: Str + + :returns: Pandas DataFrame with the results of query + :rtype: pd.DataFrame + + **Example: calling spanner_query_to_df :: import graphistry - SPANNER_CONF = { "project_id": PROJECT_ID, + # credentials_file is optional, all others are required + SPANNER_CONF = { "project_id": PROJECT_ID, "instance_id": INSTANCE_ID, - "database_id": DATABASE_ID } + "database_id": DATABASE_ID, + "credentials_file": CREDENTIALS_FILE } graphistry.register(..., spanner_config=SPANNER_CONF) - g = graphistry.spanner_query("Graph MyGraph\nMATCH ()-[]->()" ) + query='SELECT * from Account limit 10000' + + df = graphistry.spanner_query_to_df(query) g.plot() @@ -2517,12 +2596,11 @@ def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable: if spanner_config is not None: logger.debug(f"Spanner Config: {spanner_config}") else: - logger.debug(f'PyGraphistry._config["spanner"] is None') + logger.warn(f'PyGraphistry._config["spanner"] is None') res = res.spanner_init(PyGraphistry._config["spanner"]) - return res._spannergraph.gql_to_graph(query) - else: - return res._spannergraph.gql_to_graph(query) + + return res._spannergraph.query_to_df(query) def nodexl(self, xls_or_url, source='default', engine=None, verbose=False): diff --git a/graphistry/__init__.py b/graphistry/__init__.py index 5ff53900b5..04e255de89 100644 --- a/graphistry/__init__.py +++ b/graphistry/__init__.py @@ -31,7 +31,8 @@ bolt, cypher, tigergraph, - spanner_query, + spanner_gql_to_g, + spanner_query_to_df, gsql, gsql_endpoint, cosmos, diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index 01d096de01..51323ad7e7 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -4,13 +4,11 @@ import time from typing import Any, List, Dict +from graphistry.Plottable import Plottable + from graphistry.util import setup_logger logger = setup_logger(__name__) -import logging 
-logging.basicConfig(level=logging.INFO) - -from google.cloud.spanner_v1.data_types import JsonObject class SpannerConnectionError(Exception): """Custom exception for errors related to Spanner connection.""" @@ -21,31 +19,19 @@ class SpannerQueryResult: Encapsulates the results of a query, including metadata. :ivar list data: The raw query results. - :ivar float execution_time: The time taken to execute the query. - :ivar int record_count: The number of records returned. """ - def __init__(self, data: List[Any], execution_time: float): + def __init__(self, data: List[Any], column_names: List[str]=None): """ Initializes a SpannerQueryResult instance. :param data: The raw query results. - :param execution_time: The time taken to execute the query. + :type List[Any] + :param column_names: a list of the column names from the cursor, defaults to None + :type: List[str], optional """ self.data = data - self.execution_time = execution_time - self.record_count = len(data) - - def summary(self) -> Dict[str, Any]: - """ - Provides a summary of the query execution. - - :return: A summary of the query results. - """ - return { - "execution_time": self.execution_time, - "record_count": self.record_count - } + self.column_names = column_names class SpannerGraph: @@ -59,7 +45,7 @@ class SpannerGraph: :ivar Any graphistry: The Graphistry parent object. """ - def __init__(self, graphistry: Any, project_id: str, instance_id: str, database_id: str): + def __init__(self, g: Plottable, project_id: str, instance_id: str, database_id: str, credentials_file: str=None): """ Initializes the SpannerGraph instance. @@ -68,10 +54,11 @@ def __init__(self, graphistry: Any, project_id: str, instance_id: str, database_ :param instance_id: The Spanner instance ID. :param database_id: The Spanner database ID. 
""" - self.graphistry = graphistry + self.g = g self.project_id = project_id self.instance_id = instance_id self.database_id = database_id + self.credentials_file = credentials_file self.connection = self.__connect() def __connect(self) -> Any: @@ -85,9 +72,13 @@ def __connect(self) -> Any: from google.cloud.spanner_dbapi.connection import connect try: - connection = connect(self.instance_id, self.database_id) + if self.credentials_file: + connection = connect(self.instance_id, self.database_id, credentials=self.credentials_file) + else: + connection = connect(self.instance_id, self.database_id) + connection.autocommit = True - logging.info("Connected to Spanner database.") + logger.info("Connected to Spanner database.") return connection except Exception as e: raise SpannerConnectionError(f"Failed to connect to Spanner: {e}") @@ -98,13 +89,14 @@ def close_connection(self) -> None: """ if self.connection: self.connection.close() - logging.info("Connection to Spanner database closed.") + logger.info("Connection to Spanner database closed.") def execute_query(self, query: str) -> SpannerQueryResult: """ Executes a GQL query on the Spanner database. - :param query: The GQL query to execute. + :param query: The GQL query to execute + :type str :return: The results of the query execution. :rtype: SpannerQueryResult :raises RuntimeError: If the query execution fails. 
@@ -116,14 +108,16 @@ def execute_query(self, query: str) -> SpannerQueryResult: cursor = self.connection.cursor() cursor.execute(query) results = cursor.fetchall() - execution_time = time.time() - start_time - logging.info(f"Query executed in {execution_time:.4f} seconds.") - return SpannerQueryResult(results, execution_time) + column_names = [desc[0] for desc in cursor.description] # extract column names + logger.debug(f'column names returned from query: {column_names}') + execution_time_s = time.time() - start_time + logger.info(f"Query completed in {execution_time_s:.3f} seconds.") + return SpannerQueryResult(results, column_names) except Exception as e: raise RuntimeError(f"Query execution failed: {e}") @staticmethod - def convert_spanner_json(data): + def convert_spanner_json(data: List[Any]) -> List[Dict[str, Any]]: from google.cloud.spanner_v1.data_types import JsonObject json_list = [] for item in data: @@ -156,6 +150,35 @@ def convert_spanner_json(data): json_list.append(json_entry) return json_list + @staticmethod + def add_type_from_label_to_df(df: pd.DataFrame) -> pd.DataFrame: + """ + Modify the input DataFrame by creating a 'type' column from 'label' for proper type handling in Graphistry. + If a 'type' column already exists, it is renamed to 'type_' before creating the new 'type' column. + + :param df: DataFrame containing node or edge data + :type df: pd.DataFrame + + :return: Modified DataFrame with the updated 'type' column. 
+ :rtype: pd.DataFrame + + """ + + # rename 'type' to 'type_' if it exists + if "type" in df.columns: + df.rename(columns={"type": "type_"}, inplace=True) + logger.info("'type' column renamed to 'type_'") + + # check if 'label' column exists before assigning it to 'type' + if "label" in df.columns: + df["type"] = df["label"] + else: + # assign None value if 'label' is missing + df["type"] = None + logger.warn("'label' column missing, 'type' set to None") + + return df + @staticmethod def get_nodes_df(json_data: list) -> pd.DataFrame: """ @@ -176,26 +199,20 @@ def get_nodes_df(json_data: list) -> pd.DataFrame: ] nodes_df = pd.DataFrame(nodes).drop_duplicates() - # if 'type' property exists, skip setting and warn - if "type" not in nodes_df.columns: - # check 'label' column exists before assigning it to 'type' - if "label" in nodes_df.columns: - nodes_df['type'] = nodes_df['label'] - else: - nodes_df['type'] = None # Assign a default value if 'label' is missing - else: - logger.warn("unable to assign 'type' from label, column exists\n") - - return nodes_df + return SpannerGraph.add_type_from_label_to_df(nodes_df) + + @staticmethod def get_edges_df(json_data: list) -> pd.DataFrame: """ Converts spanner json edges into a pandas DataFrame :param json_data: The structured JSON data containing graph edges. + :type list :return: A DataFrame containing edge information. 
:rtype: pd.DataFrame + """ edges = [ { @@ -210,85 +227,41 @@ def get_edges_df(json_data: list) -> pd.DataFrame: ] edges_df = pd.DataFrame(edges).drop_duplicates() - # if 'type' property exists, skip setting and warn - if "type" not in edges_df.columns: - # check 'label' column exists before assigning it to 'type' - if "label" in edges_df.columns: - edges_df['type'] = edges_df['label'] - else: - edges_df['type'] = None # Assign a default value if 'label' is missing - else: - logger.warn("unable to assign 'type' from label, column exists\n") + return SpannerGraph.add_type_from_label_to_df(edges_df) - return edges_df - def gql_to_graph(self, query: str) -> Any: + def gql_to_graph(self, query: str) -> Plottable: """ Executes a query and constructs a Graphistry graph from the results. :param query: The GQL query to execute. :return: A Graphistry graph object constructed from the query results. + :rtype: Plottable """ query_result = self.execute_query(query) + # convert json result set to a list query_result_list = [ query_result.data ] + json_data = self.convert_spanner_json(query_result_list) + nodes_df = self.get_nodes_df(json_data) edges_df = self.get_edges_df(json_data) - # TODO(tcook): add more error handling here if nodes or edges are empty - g = self.graphistry.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') - return g - # TODO(tcook): add wrapper funcs in PlotterBase for these utility functions: - - def get_schema(self) -> Dict[str, List[Dict[str, str]]]: - """ - Retrieves the schema of the Spanner database. + # TODO(tcook): add more error handling here if nodes or edges are empty + return self.g.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') - :return: A dictionary containing table names and column details. 
+ def query_to_df(self, query: str) -> pd.DataFrame: """ - schema = {} - try: - cursor = self.connection.cursor() - cursor.execute("SELECT table_name, column_name, spanner_type FROM information_schema.columns") - for row in cursor.fetchall(): - table_name, column_name, spanner_type = row - if table_name not in schema: - schema[table_name] = [] - schema[table_name].append({"column_name": column_name, "type": spanner_type}) - logging.info("Database schema retrieved successfully.") - except Exception as e: - logging.error(f"Failed to retrieve schema: {e}") - return schema + Executes a query and returns a pandas dataframe of results - def validate_data(self, data: Dict[str, List[Dict[str, Any]]], schema: Dict[str, List[Dict[str, str]]]) -> bool: + :param query: The query to execute. + :return: pandas dataframe of the query results + :rtype: pd.DataFrame """ - Validates input data against the database schema. + query_result = self.execute_query(query) - :param data: The data to validate. - :param schema: The schema of the database. - :return: True if the data is valid, False otherwise. - """ - for table, columns in data.items(): - if table not in schema: - logging.error(f"Table {table} does not exist in schema.") - return False - for record in columns: - for key in record.keys(): - if key not in [col["column_name"] for col in schema[table]]: - logging.error(f"Column {key} is not valid for table {table}.") - return False - logging.info("Data validation passed.") - return True - - def dump_config(self) -> Dict[str, str]: - """ - Returns the current configuration of the SpannerGraph instance. + # create DataFrame from json results, adding column names + df = pd.DataFrame(query_result.data, columns=query_result.column_names) - :return: A dictionary containing configuration details. 
- """ - return { - "project_id": self.project_id, - "instance_id": self.instance_id, - "database_id": self.database_id - } + return df diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index 50a320a51c..7c1ef90662 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -580,7 +580,7 @@ def set_spanner_config(spanner_config): :returns: None. :rtype: None - **Example: calling set_spanner_config** + **Example: calling set_spanner_config - all keys are required** :: import graphistry @@ -591,7 +591,20 @@ def set_spanner_config(spanner_config): "database_id": DATABASE_ID } graphistry.set_spanner_config(SPANNER_CONF) - + + **Example: calling set_spanner_config with credentials_file (optional) - used for service accounts** + :: + + import graphistry + graphistry.register(...) + + SPANNER_CONF = { "project_id": PROJECT_ID, + "instance_id": INSTANCE_ID, + "database_id": DATABASE_ID, + "credentials_file": CREDENTIALS_FILE } + + graphistry.set_spanner_config(SPANNER_CONF) + """ if spanner_config is not None: @@ -1883,10 +1896,83 @@ def tigergraph( ) @staticmethod - def spanner_query(query: str, params: Dict[str, Any] = {}) -> Plottable: - # TODO(tcook): add pydocs - return Plotter().spanner_query(query, params) + def spanner_gql_to_g(query: str) -> Plottable: + """ + Submit GQL query to google spanner graph database and return Plottable with nodes and edges populated + + GQL must be a path query with a syntax similar to the following, it's recommended to return the path with + SAFE_TO_JSON(p), TO_JSON() can also be used, but not recommend. LIMIT is optional, but for large graphs with millions + of edges or more, it's best to filter either in the query or use LIMIT so as not to exhaust GPU memory. 
+ + query=f'''GRAPH my_graph + MATCH p = (a)-[b]->(c) LIMIT 100000 return SAFE_TO_JSON(p) as path''' + + :param query: GQL query string + :type query: Str + + :returns: Plottable with the results of GQL query as a graph + :rtype: Plottable + + **Example: calling spanner_gql_to_g + :: + + import graphistry + + # credentials_file is optional, all others are required + SPANNER_CONF = { "project_id": PROJECT_ID, + "instance_id": INSTANCE_ID, + "database_id": DATABASE_ID, + "credentials_file": CREDENTIALS_FILE } + + graphistry.register(..., spanner_config=SPANNER_CONF) + + query=f'''GRAPH my_graph + MATCH p = (a)-[b]->(c) LIMIT 100000 return SAFE_TO_JSON(p) as path''' + + g = graphistry.spanner_gql_to_g(query) + g.plot() + + """ + return Plotter().spanner_gql_to_g(query) + + @staticmethod + def spanner_query_to_df(query: str) -> pd.DataFrame: + """ + + Submit query to google spanner database and return a df of the results + + query can be SQL or GQL as long as table of results are returned + + query='SELECT * from Account limit 10000' + + :param query: query string + :type query: Str + + :returns: Pandas DataFrame with the results of query + :rtype: pd.DataFrame + + **Example: calling spanner_query_to_df + :: + + import graphistry + + # credentials_file is optional, all others are required + SPANNER_CONF = { "project_id": PROJECT_ID, + "instance_id": INSTANCE_ID, + "database_id": DATABASE_ID, + "credentials_file": CREDENTIALS_FILE } + + graphistry.register(..., spanner_config=SPANNER_CONF) + + query='SELECT * from Account limit 10000' + + df = graphistry.spanner_query_to_df(query) + + g.plot() + + """ + return Plotter().spanner_query_to_df(query) @staticmethod def gsql_endpoint( @@ -2549,7 +2635,8 @@ def _handle_api_response(response): cypher = PyGraphistry.cypher nodexl = PyGraphistry.nodexl tigergraph = PyGraphistry.tigergraph -spanner_query = PyGraphistry.spanner_query +spanner_gql_to_g = PyGraphistry.spanner_gql_to_g +spanner_query_to_df = 
PyGraphistry.spanner_query_to_df cosmos = PyGraphistry.cosmos neptune = PyGraphistry.neptune gremlin = PyGraphistry.gremlin From ed364eeba3883572465d19c90e685e3761c5d4b8 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 17 Jan 2025 03:17:16 -0600 Subject: [PATCH 16/33] updated notebook with more examples --- .../google_spanner_finance_graph.ipynb | 332 ++++++++++++++++-- 1 file changed, 309 insertions(+), 23 deletions(-) diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb index 3039268ba8..af9dee4587 100644 --- a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb +++ b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -1,5 +1,82 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "19a73e4f-c046-4ec9-acb7-472e4888e810", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Graphistry\n", + " \n", + " \"Google\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "814017ae-af4e-4331-b0b0-ed016d010ade", + "metadata": {}, + "source": [ + "## Demo Notebook - Graphistry and Google Spanner Graph \n", + "\n", + "This interactive guide demonstrates how to combine the power of Graphistry's visual graph analytics and AI with the robust data capabilities of Google Cloud Spanner Graph. \n", + "\n", + "#### Why Graphistry + Google Cloud Spanner?\n", + "\n", + "Graphistry is a cutting-edge platform for large-scale visual graph exploration and analysis. It enables users to intuitively investigate complex relationships, patterns and anomalies across vast datasets through highly interactive, GPU-accelerated visualizations. Google Cloud Spanner, on the other hand, is a globally distributed, horizontally scalable, and strongly consistent database ideal for managing large, interconnected datasets.\n", + "\n", + "Together, these technologies empower you to:\n", + "- **Visualize Complex Graphs**: Easily explore relationships and uncover insights in your data through rich visual representations.\n", + "- **Handle Large Datasets**: Leverage Cloud Spanner’s ability to manage vast amounts of interconnected information with strong consistency and scalability.\n", + "- **Perform Advanced Analytics**: Apply graph-based algorithms and clustering techniques to extract actionable insights from structured data.\n", + "\n", + "#### What Will You Learn?\n", + "\n", + "This notebook showcases:\n", + "1. **Connecting to Cloud Spanner**: How to retrieve and preprocess data from Cloud Spanner for graph processing.\n", + "2. **Graph Visualization with Graphistry**: Turning raw data into meaningful visualizations to explore relationships and clusters.\n", + "3. 
**Real-World Use Cases**: Applying these tools to solve practical problems such as anomaly detection, recommendation systems, and network optimization.\n", + "\n", + "#### Who Is This For?\n", + "\n", + "This demo is designed for:\n", + "- **Data Scientists**: Interested in adding visual graph analytics to their toolkit.\n", + "- **Database Engineers**: Looking to integrate graph capabilities into their Cloud Spanner workflows.\n", + "- **Decision Makers**: Exploring actionable insights from complex datasets.\n", + "\n", + "#### Prerequisites\n", + "\n", + "To follow along, you’ll need:\n", + "- A Google Cloud account with access to Cloud Spanner.\n", + "- A Graphistry Enterprise Server or free-tier [Graphistry Hub account](https://www.graphistry.com/get-started) \n", + "- Python environment with Graphistry installed.\n", + "- Basic knowledge of SQL, GQL and graph concepts.\n", + "- This demo is based on [FinGraph sample graph](https://codelabs.developers.google.com/codelabs/spanner-graph-getting-started#0)\n", + "\n", + "#### Let’s Get Started!\n", + "\n", + "Dive in and see how the synergy of Graphistry and Google Cloud Spanner can transform your data exploration and analysis workflows. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "970195db-a67f-4cb0-b5d0-6927ecebb166", + "metadata": {}, + "outputs": [], + "source": [ + "# install required dependencies for graphistry + google spanner \n", + "!pip install graphistry[spanner]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -8,7 +85,6 @@ "outputs": [], "source": [ "import graphistry\n", - "import json\n", "graphistry.__version__" ] }, @@ -36,11 +112,18 @@ "# or use: \n", "# LIMIT_CLAUSE = \"limit 1000\" \n", "\n", + "# google settings, option 1: interactive login using gcloud auth application-default login (below)\n", "SPANNER_CONF = { \"project_id\": PROJECT_ID, \n", " \"instance_id\": INSTANCE_ID, \n", " \"database_id\": DATABASE_ID }\n", "\n", - "# print(json.dumps(SPANNER_CONF, indent=4))" + "# google settings, option 2: use a service account key: \n", + "\n", + "# KEY_FILE=\"/path/to/credentials.json\"\n", + "# SPANNER_CONF = { \"project_id\": PROJECT_ID, \n", + "# \"instance_id\": INSTANCE_ID, \n", + "# \"database_id\": DATABASE_ID, \n", + "# \"credentials_file\": KEY_FILE}\n" ] }, { @@ -88,24 +171,24 @@ { "cell_type": "code", "execution_count": null, - "id": "4b42c055-2f60-4ac8-975d-f005e37afec0", + "id": "0a03e44e-12e9-4122-b06e-145defc74652", "metadata": {}, "outputs": [], "source": [ - "!gcloud auth application-default login" + "#!gcloud auth application-default login" ] }, { "cell_type": "code", "execution_count": null, - "id": "11e41457-303c-4d5e-ae0e-7015db33d9f7", + "id": "f175d952-c1cc-4793-aad5-0cdb5142b6fe", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", - "id": "88eb24b0-7d3b-4629-813c-fc989ba1ea90", + "id": "c2b02e8c-e919-4260-8979-d22f7ac86002", "metadata": {}, "source": [ "### Example 1: GQL Path Query to Graphistry Visualization of all nodes and edges (LIMIT optional) \n", @@ -116,7 +199,7 @@ "the query will not produce the results needed to properly load a graphistry graph. 
LIMIT is optional, but for large graphs with millions\n", " of edges or more, it's best to filter either in the query or use LIMIT so as not to exhaust GPU memory. \n", "\n", - "```python\n", + "```sql\n", "GRAPH FinGraph\n", "MATCH p = (a)-[b]->(c) where 1=1 LIMIT 10000 return SAFE_TO_JSON(p) as path\n", "```\n" @@ -125,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58ee08b2-29e1-47db-b0a8-440f6171e54d", + "id": "dd3c4b35-e8e1-43c7-bc66-d94b3d3e8451", "metadata": {}, "outputs": [], "source": [ @@ -138,7 +221,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3d606a3f-e807-4fa7-893e-52a95d238cc0", + "id": "0b544ab6-18b4-4fc9-adb1-3137bec62fb7", "metadata": {}, "outputs": [], "source": [ @@ -148,14 +231,14 @@ { "cell_type": "code", "execution_count": null, - "id": "ae275af9-f354-454b-bbeb-e423ae4acfba", + "id": "89a9f20b-8794-4d1d-8fd8-9f1e2c68f28d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", - "id": "b9d18502-9bc1-4f7e-af3a-b99f7d848e08", + "id": "bc18a3ad-c8bc-48cf-8ef5-bc0013ae3fa0", "metadata": {}, "source": [ "#### Example 1.1 - inspect contents of graphistry graph (nodes and edges): " @@ -164,7 +247,7 @@ { "cell_type": "code", "execution_count": null, - "id": "42e451d7-8f97-45a8-bb45-7c4271f11f68", + "id": "936a4908-c62c-44cc-8b77-f56ebea5fca6", "metadata": {}, "outputs": [], "source": [ @@ -174,7 +257,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5078632-24b2-4139-8e4f-a2c18f1efd94", + "id": "8bdb6728-324e-4da3-89f9-4be53fd1b50f", "metadata": {}, "outputs": [], "source": [ @@ -184,7 +267,7 @@ { "cell_type": "code", "execution_count": null, - "id": "376bf2a7-931a-4c3f-bfda-3413734e5ad7", + "id": "d4858f9f-6355-4a53-b1fd-a789c148bdab", "metadata": {}, "outputs": [], "source": [ @@ -194,21 +277,102 @@ { "cell_type": "code", "execution_count": null, - "id": "05d8b293-62b7-4056-b924-ff04284559b9", + "id": "4fe75ab3-ef30-4b32-ad0a-8cfcc2dfff45", + "metadata": {}, + "outputs": [], + 
"source": [] + }, + { + "cell_type": "markdown", + "id": "d3a6150b-3513-4c3d-a696-2b1e9b273a42", + "metadata": {}, + "source": [ + "### Example 2: Quantified path traversal \n", + "(slightly modified from example to use a path query for visualization)\n", + "\n", + "from: [spanner-graph-getting-started](https://codelabs.developers.google.com/codelabs/spanner-graph-getting-started#6)\n", + "\n", + "**Query 2** - Quantified path traversal and return graph elements\n", + "\n", + "The following query matches all account money transfers starting from a source account with id=75 within 3 to 6 hops, to reach a destination account with id=199. The {3,6} syntax is used to represent a quantified 3 to 6 hop path traversal between src_accnt and dst_accnt.\n", + "\n", + "```sql\n", + "GRAPH FinGraph\n", + "MATCH\n", + "p = (src_accnt:Account {id:75})-[transfers:Transfers]->{3,6}\n", + " (dst_accnt:Account {id:199}) \n", + "RETURN SAFE_TO_JSON(p) as path\n", + "```\n", + "\n", + "Visually, you can think of the quantified edge traversal like below: it starts from a src_account node, and fetches all possible account transfer paths between 3 to 6 hops, to reach dst_account.\n", + "\n", + "The highlighted path at the bottom below, for example, is a 6-hop query.\n", + "\n", + "\"query2\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb58e9ff-27e9-4241-a0c6-86bbe1879411", + "metadata": {}, + "outputs": [], + "source": [ + "query2='''GRAPH FinGraph\n", + "MATCH\n", + "p = (src_accnt:Account {id:75})-[transfers:Transfers]->{3,6}\n", + " (dst_accnt:Account {id:199}) where 1=1 \n", + "RETURN SAFE_TO_JSON(p) as path\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe4f9812-6105-4d22-999a-dea1e62a7820", + "metadata": {}, + "outputs": [], + "source": [ + "g2 = graphistry.spanner_gql_to_g(query2)\n", + "g2.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71f79ecb-faff-4d8d-a682-0ca332d73e57", + 
"metadata": {}, + "outputs": [], + "source": [ + "# now run again and retrieve all the paths \n", + "query2a='''GRAPH FinGraph\n", + "MATCH\n", + "p = (src_accnt:Account )-[transfers:Transfers]->{3,6}\n", + " (dst_accnt:Account ) where 1=1 \n", + "RETURN SAFE_TO_JSON(p) as path\n", + "'''\n", + "g2a = graphistry.spanner_gql_to_g(query2a)\n", + "g2a.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce9220f7-19e2-4123-a036-d22c490b0187", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", - "id": "0013cceb-f32f-48a0-9f02-6b75a2704294", + "id": "1a6dbcab-a791-4b60-8d50-51c5e99b268f", "metadata": {}, "source": [ - "### Example 2: Spanner GQL Query to pandas dataframe (LIMIT optional) \n", + "### Example 3: Spanner GQL Tabular Query to pandas dataframe (LIMIT optional) \n", "\n", "This example shows a non-path query that returns tabular results, which are then convered to a dataframe for easy manipulation and inspection of the results. \n", "\n", - "```python\n", + "```sql\n", "GRAPH FinGraph \n", "MATCH (p:Person)-[]-()->(l:Loan)\n", "RETURN p.id as ID, p.name AS Name, SUM(l.loan_amount) AS TotalBorrowed\n", @@ -220,12 +384,12 @@ { "cell_type": "code", "execution_count": null, - "id": "523f57b2-d09f-4aa2-8626-5af74d5d9a20", + "id": "1379ce57-5c2a-4d20-94b1-2cfc6d2d6677", "metadata": {}, "outputs": [], "source": [ "query_top10='''GRAPH FinGraph \n", - "MATCH (p:Person)-[]-()->(l:Loan) WHERE 1=1\n", + "MATCH (p:Person)-[:Owns]-(:Account)->(l:Loan) WHERE 1=1\n", "RETURN p.id as ID, p.name AS Name, SUM(l.loan_amount) AS TotalBorrowed\n", "ORDER BY TotalBorrowed DESC\n", "LIMIT 10'''" @@ -234,7 +398,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a51917d3-6c17-4438-981d-b7d441ec89ec", + "id": "facacf92-7758-4796-9595-fc0d2230287c", "metadata": {}, "outputs": [], "source": [ @@ -244,7 +408,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aab6fb57-6f09-41ae-927a-4f7b5f47db19", + "id": 
"d44f1a10-09c3-4407-9de1-01da06f54b2c", "metadata": {}, "outputs": [], "source": [ @@ -254,7 +418,129 @@ { "cell_type": "code", "execution_count": null, - "id": "7a55789a-16ed-4b6f-b2d0-e374999f1806", + "id": "a60a7792-9032-4ab4-951f-caeaf129701c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "caee2321-a96c-4eeb-8166-5359b7a63687", + "metadata": {}, + "source": [ + "### Example 4: Spanner SQL Query to pandas dataframe \n", + "\n", + "This example shows a SQL query to Spanner that returns tabular results, which are then converted to a dataframe for easy manipulation and inspection of the results. \n", + "
\n", + "\n", + "#### Query: \n", + "```sql\n", + "SELECT * from Account\n", + "```\n", + "
\n", + "\n", + "\"Cloudblog\n", + "\n", + "\n", + "##### source: https://cloud.google.com/blog/products/databases/announcing-spanner-graph\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "feb8de69-56bc-45d3-a364-af4b5fdd4e42", + "metadata": {}, + "outputs": [], + "source": [ + "accounts_df = graphistry.spanner_query_to_df('SELECT * from Account')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97308e97-8a5d-4546-9324-049b4a2af0a5", + "metadata": {}, + "outputs": [], + "source": [ + "accounts_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "892cd8dc-6b0d-4ef2-a8d7-49a7ab798355", + "metadata": {}, + "source": [ + "### Example 5: Spanner SQL Query to inspect the database schema\n", + "\n", + "This example shows a SQL query to Spanner that retrieves the tables, columns and types from the information schema in Spanner. This can be helpful for seeing what's available in the database or using this data as part of a workflow. \n", + "
\n", + "\n", + "```sql\n", + "SELECT table_name, column_name, spanner_type FROM information_schema.columns\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa101eb2-0098-4e62-ba3c-0a57ceba75ac", + "metadata": {}, + "outputs": [], + "source": [ + "columns_df = graphistry.spanner_query_to_df('SELECT table_name, column_name, spanner_type FROM information_schema.columns')\n", + "columns_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e293115a-9d39-43fb-8402-ce6642b8ab74", + "metadata": {}, + "outputs": [], + "source": [ + "len(columns_df.table_name.unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc0a477d-f10e-45da-a102-7a178894351a", + "metadata": {}, + "outputs": [], + "source": [ + "query_tables='''\n", + "SELECT table_name, table_type\n", + "FROM information_schema.tables\n", + "WHERE table_catalog = ''\n", + " AND table_schema = ''\n", + " AND table_type IN ('BASE TABLE', 'VIEW');\n", + "'''\n", + "\n", + "tables_df = graphistry.spanner_query_to_df(query_tables)\n", + "tables_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c340ea8-9dee-4540-8d97-f0644564ad4f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307eede5-8b8a-4286-83c0-0dfecbdbd38e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "801f12ab-e3a1-4ad2-aba4-5c364010732b", "metadata": {}, "outputs": [], "source": [] From 8a2df08dd50b1cf59aac01bc4dd4a450b17c7e95 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 17 Jan 2025 03:19:10 -0600 Subject: [PATCH 17/33] fix lint issues --- graphistry/PlotterBase.py | 5 ++--- graphistry/plugins/spannergraph.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index ddc1ac5b09..8b6e72489f 100644 --- 
a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -2535,7 +2535,6 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: g.plot() """ - from .pygraphistry import PyGraphistry res = copy.copy(self) @@ -2545,7 +2544,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: if spanner_config is not None: logger.debug(f"Spanner Config: {spanner_config}") else: - logger.warn(f'PyGraphistry._config["spanner"] is None') + logger.warning('PyGraphistry._config["spanner"] is None') res = res.spanner_init(PyGraphistry._config["spanner"]) @@ -2596,7 +2595,7 @@ def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame: if spanner_config is not None: logger.debug(f"Spanner Config: {spanner_config}") else: - logger.warn(f'PyGraphistry._config["spanner"] is None') + logger.warning('PyGraphistry._config["spanner"] is None') res = res.spanner_init(PyGraphistry._config["spanner"]) diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index 51323ad7e7..0676676d80 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -21,7 +21,7 @@ class SpannerQueryResult: :ivar list data: The raw query results. """ - def __init__(self, data: List[Any], column_names: List[str]=None): + def __init__(self, data: List[Any], column_names: List[str] = None): """ Initializes a SpannerQueryResult instance. @@ -45,7 +45,7 @@ class SpannerGraph: :ivar Any graphistry: The Graphistry parent object. """ - def __init__(self, g: Plottable, project_id: str, instance_id: str, database_id: str, credentials_file: str=None): + def __init__(self, g: Plottable, project_id: str, instance_id: str, database_id: str, credentials_file: str = None): """ Initializes the SpannerGraph instance. 
From 5e15cbed8a7e61043532f37b9ea7836ccd796248 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 17 Jan 2025 11:56:53 -0600 Subject: [PATCH 18/33] fix linting errors --- graphistry/plugins/spannergraph.py | 8 ++++---- mypy.ini | 6 ++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index 0676676d80..4e379e9402 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -21,14 +21,14 @@ class SpannerQueryResult: :ivar list data: The raw query results. """ - def __init__(self, data: List[Any], column_names: List[str] = None): + def __init__(self, data: List[Any], column_names: List[str]): """ Initializes a SpannerQueryResult instance. :param data: The raw query results. :type List[Any] :param column_names: a list of the column names from the cursor, defaults to None - :type: List[str], optional + :type: List[str] """ self.data = data self.column_names = column_names @@ -45,7 +45,7 @@ class SpannerGraph: :ivar Any graphistry: The Graphistry parent object. """ - def __init__(self, g: Plottable, project_id: str, instance_id: str, database_id: str, credentials_file: str = None): + def __init__(self, g: Plottable, project_id: str, instance_id: str, database_id: str, credentials_file: Optional[str] = None): """ Initializes the SpannerGraph instance. 
@@ -122,7 +122,7 @@ def convert_spanner_json(data: List[Any]) -> List[Dict[str, Any]]: json_list = [] for item in data: for elements in item: - json_entry = {"nodes": [], "edges": []} + json_entry: Dict[str, List] = {"nodes": [], "edges": []} for element in elements: element_dict_list = json.loads(element.serialize()) if isinstance(element, JsonObject) else element for element_dict in element_dict_list: diff --git a/mypy.ini b/mypy.ini index 0529d3134a..cf448f8b72 100644 --- a/mypy.ini +++ b/mypy.ini @@ -97,3 +97,9 @@ ignore_missing_imports = True [mypy-cuml.*] ignore_missing_imports = True + +[mypy-google.cloud.spanner_dbapi.connection] +ignore_missing_imports = True + +[mypy-google.cloud.spanner_v1.data_types] +ignore_missing_imports = True From 4a8e24474d06c20194d7c5ccab656fdd86413916 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 17 Jan 2025 13:02:44 -0600 Subject: [PATCH 19/33] fix linting errors --- graphistry/plugins/spannergraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index 4e379e9402..edd49d8eae 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -2,7 +2,7 @@ import pandas as pd import json import time -from typing import Any, List, Dict +from typing import Any, List, Dict, Optional from graphistry.Plottable import Plottable From fecc26009a9c1b5403a5f7f652c1935890124e3c Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 17 Jan 2025 16:33:28 -0600 Subject: [PATCH 20/33] fix more lint issue --- graphistry/Plottable.py | 3 ++- graphistry/PlotterBase.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/graphistry/Plottable.py b/graphistry/Plottable.py index 9f10f42f68..21aaea8300 100644 --- a/graphistry/Plottable.py +++ b/graphistry/Plottable.py @@ -62,7 +62,8 @@ class Plottable(object): _complex_encodings : dict _bolt_driver : Any _tigergraph : Any - + _spannergraph: SpannerGraph = None + 
_dataset_id: Optional[str] _url: Optional[str] _nodes_file_id: Optional[str] diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index f9b9367c45..ac38cd5eb8 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -178,7 +178,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # Integrations self._bolt_driver : Any = None self._tigergraph : Any = None - self._spannergraph : Any = None + self._spannergraph: SpannerGraph = None # feature engineering self._node_embedding = None @@ -2546,7 +2546,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: else: logger.warning('PyGraphistry._config["spanner"] is None') - res = res.spanner_init(PyGraphistry._config["spanner"]) + res = res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] return res._spannergraph.gql_to_graph(query) From ddbfc120700e3fe7ab6f1c83d295e04865cb5963 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 17 Jan 2025 17:01:27 -0600 Subject: [PATCH 21/33] fix more lint issue --- graphistry/Plottable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphistry/Plottable.py b/graphistry/Plottable.py index 21aaea8300..795d8dbdfe 100644 --- a/graphistry/Plottable.py +++ b/graphistry/Plottable.py @@ -62,7 +62,7 @@ class Plottable(object): _complex_encodings : dict _bolt_driver : Any _tigergraph : Any - _spannergraph: SpannerGraph = None + _spannergraph: Any = None _dataset_id: Optional[str] _url: Optional[str] From 886f23d630d1c22b11f1561f78dec52cc0f7b3d0 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 17 Jan 2025 17:03:07 -0600 Subject: [PATCH 22/33] fix more lint issue --- graphistry/PlotterBase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index ac38cd5eb8..4bafbc2e3f 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -2546,7 +2546,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: 
else: logger.warning('PyGraphistry._config["spanner"] is None') - res = res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] + res = res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] return res._spannergraph.gql_to_graph(query) From 0d6d2f36aed09aa60956b47c129e97b08dda767f Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Fri, 17 Jan 2025 17:34:26 -0600 Subject: [PATCH 23/33] fix more lint issue --- graphistry/PlotterBase.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 4bafbc2e3f..cc4dae5233 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -178,7 +178,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # Integrations self._bolt_driver : Any = None self._tigergraph : Any = None - self._spannergraph: SpannerGraph = None + self._spannergraph: Any = None # feature engineering self._node_embedding = None @@ -2597,7 +2597,7 @@ def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame: else: logger.warning('PyGraphistry._config["spanner"] is None') - res = res.spanner_init(PyGraphistry._config["spanner"]) + res = res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] return res._spannergraph.query_to_df(query) From 63598fd79525d90850a719071c1316266df51972 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Sun, 19 Jan 2025 19:23:05 -0600 Subject: [PATCH 24/33] updated notebook with CTA and other docs --- .../google_spanner_finance_graph.ipynb | 116 +++++++++++++++--- 1 file changed, 101 insertions(+), 15 deletions(-) diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb index af9dee4587..3dce27ab84 100644 --- a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb +++ b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -50,6 +50,7 @@ 
"This demo is designed for:\n", "- **Data Scientists**: Interested in adding visual graph analytics to their toolkit.\n", "- **Database Engineers**: Looking to integrate graph capabilities into their Cloud Spanner workflows.\n", + "- **Application Developer**: Prototyping applications built \n", "- **Decision Makers**: Exploring actionable insights from complex datasets.\n", "\n", "#### Prerequisites\n", @@ -308,7 +309,9 @@ "\n", "The highlighted path at the bottom below, for example, is a 6-hop query.\n", "\n", - "\"query2\"\n" + "\"query2\"\n", + "\n", + "##### source: https://codelabs.developers.google.com/static/codelabs/spanner-graph-getting-started/#6\n" ] }, { @@ -530,20 +533,103 @@ "source": [] }, { - "cell_type": "code", - "execution_count": null, - "id": "307eede5-8b8a-4286-83c0-0dfecbdbd38e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "801f12ab-e3a1-4ad2-aba4-5c364010732b", - "metadata": {}, - "outputs": [], - "source": [] + "cell_type": "markdown", + "id": "ddc8b403-a2d3-45d5-9eb7-c46d445ed179", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
\n", + "

Graphistry Resources

\n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "

© 2025 Graphistry Resources. All rights reserved.

\n", + "
\n", + "\n", + "\n", + "" + ] } ], "metadata": { From c8941a759d8f9cd5260a6e07540f2b6990e2487a Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Sun, 19 Jan 2025 19:33:44 -0600 Subject: [PATCH 25/33] fix for readthedocs markdown --- .../google_spanner_finance_graph.ipynb | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb index 3dce27ab84..c97e281726 100644 --- a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb +++ b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -25,7 +25,7 @@ "id": "814017ae-af4e-4331-b0b0-ed016d010ade", "metadata": {}, "source": [ - "## Demo Notebook - Graphistry and Google Spanner Graph \n", + "# Demo Notebook - Graphistry and Google Spanner Graph \n", "\n", "This interactive guide demonstrates how to combine the power of Graphistry's visual graph analytics and AI with the robust data capabilities of Google Cloud Spanner Graph. \n", "\n", @@ -43,15 +43,14 @@ "This notebook showcases:\n", "1. **Connecting to Cloud Spanner**: How to retrieve and preprocess data from Cloud Spanner for graph processing.\n", "2. **Graph Visualization with Graphistry**: Turning raw data into meaningful visualizations to explore relationships and clusters.\n", - "3. **Real-World Use Cases**: Applying these tools to solve practical problems such as anomaly detection, recommendation systems, and network optimization.\n", + "3. 
**Real-World Use Cases**: Applying these tools to solve practical problems \n", "\n", "#### Who Is This For?\n", "\n", "This demo is designed for:\n", "- **Data Scientists**: Interested in adding visual graph analytics to their toolkit.\n", "- **Database Engineers**: Looking to integrate graph capabilities into their Cloud Spanner workflows.\n", - "- **Application Developer**: Prototyping applications built \n", - "- **Decision Makers**: Exploring actionable insights from complex datasets.\n", + "- **Application Developer**: Prototyping applications built using Graphistry and Google Spanner \n", "\n", "#### Prerequisites\n", "\n", @@ -94,7 +93,7 @@ "id": "1407509d-f079-4f66-bc5f-bd93c9510d09", "metadata": {}, "source": [ - "### Settings" + "## Settings" ] }, { @@ -132,7 +131,7 @@ "id": "97649200-c7ab-4041-bb19-f7ab6363ead3", "metadata": {}, "source": [ - "### Graphistry register and gcloud init" + "## Graphistry register and gcloud init" ] }, { @@ -192,7 +191,7 @@ "id": "c2b02e8c-e919-4260-8979-d22f7ac86002", "metadata": {}, "source": [ - "### Example 1: GQL Path Query to Graphistry Visualization of all nodes and edges (LIMIT optional) \n", + "## Example 1: GQL Path Query to Graphistry Visualization of all nodes and edges (LIMIT optional) \n", "\n", "to extract the data from Spanner Graph as a graph with nodes and edges in a single object, a GQL path query is required. 
\n", "\n", @@ -242,7 +241,7 @@ "id": "bc18a3ad-c8bc-48cf-8ef5-bc0013ae3fa0", "metadata": {}, "source": [ - "#### Example 1.1 - inspect contents of graphistry graph (nodes and edges): " + "### Example 1.1 - inspect contents of graphistry graph (nodes and edges): " ] }, { @@ -288,7 +287,7 @@ "id": "d3a6150b-3513-4c3d-a696-2b1e9b273a42", "metadata": {}, "source": [ - "### Example 2: Quantified path traversal \n", + "## Example 2: Quantified path traversal \n", "(slightly modified from example to use a path query for visualization)\n", "\n", "from: [spanner-graph-getting-started](https://codelabs.developers.google.com/codelabs/spanner-graph-getting-started#6)\n", @@ -371,7 +370,7 @@ "id": "1a6dbcab-a791-4b60-8d50-51c5e99b268f", "metadata": {}, "source": [ - "### Example 3: Spanner GQL Tabular Query to pandas dataframe (LIMIT optional) \n", + "## Example 3: Spanner GQL Tabular Query to pandas dataframe (LIMIT optional) \n", "\n", "This example shows a non-path query that returns tabular results, which are then convered to a dataframe for easy manipulation and inspection of the results. \n", "\n", @@ -431,7 +430,7 @@ "id": "caee2321-a96c-4eeb-8166-5359b7a63687", "metadata": {}, "source": [ - "### Example 4: Spanner SQL Query to pandas dataframe \n", + "## Example 4: Spanner SQL Query to pandas dataframe \n", "\n", "This example shows a SQL query to Spanner that returns tabular results, which are then convered to a dataframe for easy manipulation and inspection of the results. \n", "
\n", @@ -474,7 +473,7 @@ "id": "892cd8dc-6b0d-4ef2-a8d7-49a7ab798355", "metadata": {}, "source": [ - "### Example 5: Spanner SQL Query to inspect the database schema\n", + "## Example 5: Spanner SQL Query to inspect the database schema\n", "\n", "This example shows a SQL query to Spanner that retrieves the tables, columns and types from the information schema in Spanner. This can be helpful for seeing what's available in the database or using this data as part of a workflow. \n", "
\n", @@ -628,8 +627,16 @@ "\n", "\n", "\n", - "" + "\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14f4b9ed-5d25-479f-8ab2-cc23920df052", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 1611c0eac247a4f6b90c6e2c7da3b888ecb63be6 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 22 Jan 2025 11:27:43 -0600 Subject: [PATCH 26/33] updates from PR comments --- graphistry/PlotterBase.py | 20 ++++---------------- graphistry/plugins/spannergraph.py | 22 ++++++++++++++++------ graphistry/pygraphistry.py | 22 ++++++++++++++++------ 3 files changed, 36 insertions(+), 28 deletions(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index cc4dae5233..c167958ed6 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -178,7 +178,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # Integrations self._bolt_driver : Any = None self._tigergraph : Any = None - self._spannergraph: Any = None + self._spannergraph: Any # feature engineering self._node_embedding = None @@ -2292,19 +2292,7 @@ def spanner_init(self: Plottable, spanner_config: Dict[str, str]) -> Plottable: """ res = copy.copy(self) - project_id = spanner_config["project_id"] - instance_id = spanner_config["instance_id"] - database_id = spanner_config["database_id"] - credentials_file = spanner_config["credentials_file"] - - # check if valid - required_keys = ["project_id", "instance_id", "database_id"] - for key in required_keys: - value = spanner_config.get(key) - if not value: # check for None or empty values - raise ValueError(f"Missing or invalid value for required Spanner configuration: '{key}'") - - res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id, credentials_file) + res._spannergraph = SpannerGraph(res, spanner_config) logger.debug("Created SpannerGraph object: {res._spannergraph}") return res @@ -2548,7 +2536,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: res = 
res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] - return res._spannergraph.gql_to_graph(query) + return res._spannergraph.gql_to_graph(self, query) def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame: """ @@ -2599,7 +2587,7 @@ def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame: res = res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] - return res._spannergraph.query_to_df(query) + return res._spannergraph.query_to_df(self, query) def nodexl(self, xls_or_url, source='default', engine=None, verbose=False): diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index edd49d8eae..f0c27ef0d1 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -45,7 +45,7 @@ class SpannerGraph: :ivar Any graphistry: The Graphistry parent object. """ - def __init__(self, g: Plottable, project_id: str, instance_id: str, database_id: str, credentials_file: Optional[str] = None): + def __init__(self, g: Plottable, spanner_config: Dict[str, str]): """ Initializes the SpannerGraph instance. @@ -54,11 +54,21 @@ def __init__(self, g: Plottable, project_id: str, instance_id: str, database_id: :param instance_id: The Spanner instance ID. :param database_id: The Spanner database ID. 
""" - self.g = g - self.project_id = project_id - self.instance_id = instance_id - self.database_id = database_id - self.credentials_file = credentials_file + + # check if valid + required_keys = ["project_id", "instance_id", "database_id"] + for key in required_keys: + value = spanner_config.get(key) + if not value: # check for None or empty values + raise ValueError(f"Missing or invalid value for required Spanner configuration: '{key}'") + + self.project_id = spanner_config["project_id"] + self.instance_id = spanner_config["instance_id"] + self.database_id = spanner_config["database_id"] + + if spanner_config.get("credentials_file"): + self.credentials_file = spanner_config["credentials_file"] + self.connection = self.__connect() def __connect(self) -> Any: diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index 7c1ef90662..020f1ae207 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -1077,12 +1077,21 @@ def bolt(driver=None): return Plotter().bolt(driver) @staticmethod - def spanner_init(spanner_config=None): + def spanner_init(spanner_config: Dict[str, str]) -> Plottable: """ + Initializes a SpannerGraph object with the provided configuration and connects to the instance db - TODO(tcook): update pydocs - :param spanner_config: a dict of project_id, instance_id and database_id for spanner connection - :return: Plotter w/spanner connection + spanner_config dict must contain the include the following keys, credentials_file is optional: + - "project_id": The GCP project ID. + - "instance_id": The Spanner instance ID. + - "database_id": The Spanner database ID. + - "credentials_file": json file API key for service accounts + + :param spanner_config A dictionary containing the Spanner configuration. + :type (Dict[str, str]) + :return: Plottable with a Spanner connection + :rtype: Plottable + :raises ValueError: If any of the required keys in `spanner_config` are missing or have invalid values. 
Call this to create a Plotter with a Spanner Graph Connection @@ -1096,6 +1105,7 @@ def spanner_init(spanner_config=None): """ if spanner_config is None: + logger.warn('spanner_init called with spanner_config with None type. Not connected.') return None else: return Plotter().spanner_init(spanner_config) @@ -1896,7 +1906,7 @@ def tigergraph( ) @staticmethod - def spanner_gql_to_g(query: str) -> Plottable: + def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: """ Submit GQL query to google spanner graph database and return Plottable with nodes and edges populated @@ -1934,7 +1944,7 @@ def spanner_gql_to_g(query: str) -> Plottable: g.plot() """ - return Plotter().spanner_gql_to_g(query) + return self.spanner_gql_to_g(query) @staticmethod def spanner_query_to_df(query: str) -> pd.DataFrame: From 6491cdcb7e92912fd4a2ebaa8a2366f04dd1635d Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 22 Jan 2025 11:28:29 -0600 Subject: [PATCH 27/33] removed None assignment for _spannergraph - per PR comments --- graphistry/Plottable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphistry/Plottable.py b/graphistry/Plottable.py index 795d8dbdfe..9a978d4c81 100644 --- a/graphistry/Plottable.py +++ b/graphistry/Plottable.py @@ -62,7 +62,7 @@ class Plottable(object): _complex_encodings : dict _bolt_driver : Any _tigergraph : Any - _spannergraph: Any = None + _spannergraph: Any _dataset_id: Optional[str] _url: Optional[str] From b78c56b2c5ac697eaf49b1fe1eb24f06680d5fa0 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 22 Jan 2025 13:00:37 -0600 Subject: [PATCH 28/33] changes to pass Plottable dynamically to SpannerGraph.gql_to_g --- graphistry/PlotterBase.py | 8 ++++---- graphistry/plugins/spannergraph.py | 9 ++++----- graphistry/pygraphistry.py | 4 ++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index c167958ed6..4fc168f0fb 100644 --- a/graphistry/PlotterBase.py +++ 
b/graphistry/PlotterBase.py @@ -2527,7 +2527,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: res = copy.copy(self) - if res._spannergraph is None: + if not hasattr(res, '_spannergraph'): spanner_config = PyGraphistry._config["spanner"] if spanner_config is not None: logger.debug(f"Spanner Config: {spanner_config}") @@ -2536,7 +2536,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: res = res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] - return res._spannergraph.gql_to_graph(self, query) + return res._spannergraph.gql_to_graph(res, query) def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame: """ @@ -2578,7 +2578,7 @@ def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame: res = copy.copy(self) - if res._spannergraph is None: + if not hasattr(res, '_spannergraph'): spanner_config = PyGraphistry._config["spanner"] if spanner_config is not None: logger.debug(f"Spanner Config: {spanner_config}") @@ -2587,7 +2587,7 @@ def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame: res = res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] - return res._spannergraph.query_to_df(self, query) + return res._spannergraph.query_to_df(query) def nodexl(self, xls_or_url, source='default', engine=None, verbose=False): diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index f0c27ef0d1..12098e36b9 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -240,7 +240,7 @@ def get_edges_df(json_data: list) -> pd.DataFrame: return SpannerGraph.add_type_from_label_to_df(edges_df) - def gql_to_graph(self, query: str) -> Plottable: + def gql_to_graph(self, res: Plottable, query: str) -> Plottable: """ Executes a query and constructs a Graphistry graph from the results. 
@@ -259,7 +259,7 @@ def gql_to_graph(self, query: str) -> Plottable: edges_df = self.get_edges_df(json_data) # TODO(tcook): add more error handling here if nodes or edges are empty - return self.g.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') + return res.nodes(nodes_df, 'identifier').edges(edges_df, 'source', 'destination') def query_to_df(self, query: str) -> pd.DataFrame: """ @@ -272,6 +272,5 @@ def query_to_df(self, query: str) -> pd.DataFrame: query_result = self.execute_query(query) # create DataFrame from json results, adding column names - df = pd.DataFrame(query_result.data, columns=query_result.column_names) - - return df + return pd.DataFrame(query_result.data, columns=query_result.column_names) + \ No newline at end of file diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index 020f1ae207..0e0717897b 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -1906,7 +1906,7 @@ def tigergraph( ) @staticmethod - def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: + def spanner_gql_to_g(query: str) -> Plottable: """ Submit GQL query to google spanner graph database and return Plottable with nodes and edges populated @@ -1944,7 +1944,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: g.plot() """ - return self.spanner_gql_to_g(query) + return Plotter().spanner_gql_to_g(query) @staticmethod def spanner_query_to_df(query: str) -> pd.DataFrame: From 64cdc8dca74b46e565ad17c0d66cb38cca24a7f1 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 22 Jan 2025 13:36:11 -0600 Subject: [PATCH 29/33] various PR review changes --- docs/source/plugins.rst | 1 + graphistry/PlotterBase.py | 7 +++++-- graphistry/__init__.py | 1 + graphistry/plugins/spannergraph.py | 4 ++-- graphistry/pygraphistry.py | 5 +++-- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst index 9ab89e0844..36c3dcadc4 100644 --- 
a/docs/source/plugins.rst +++ b/docs/source/plugins.rst @@ -23,6 +23,7 @@ Graph * `Gremlin `_ (:class:`graphistry.gremlin.GremlinMixin`) * `Memgraph `_ (:meth:`graphistry.PlotterBase.PlotterBase.cypher`) * `Neo4j `_ (:meth:`graphistry.PlotterBase.PlotterBase.cypher`) +* `Google Spanner Graph `_ (:meth:`graphistry.PlotterBase.PlotterBase.spanner_gql_to_g`) * `TigerGraph `_ (:meth:`graphistry.PlotterBase.PlotterBase.gsql`) * `Trovares `_ diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 4fc168f0fb..3e47c12e45 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -39,7 +39,7 @@ from .arrow_uploader import ArrowUploader from .nodexlistry import NodeXLGraphistry from .tigeristry import Tigeristry -from .plugins.spannergraph import SpannerGraph +#from .plugins.spannergraph import SpannerGraph from .util import setup_logger logger = setup_logger(__name__) @@ -2290,6 +2290,8 @@ def spanner_init(self: Plottable, spanner_config: Dict[str, str]) -> Plottable: :raises ValueError: If any of the required keys in `spanner_config` are missing or have invalid values. 
""" + from .plugins.spannergraph import SpannerGraph + res = copy.copy(self) res._spannergraph = SpannerGraph(res, spanner_config) @@ -2524,6 +2526,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: """ from .pygraphistry import PyGraphistry + from .plugins.spannergraph import SpannerGraph res = copy.copy(self) @@ -2532,7 +2535,7 @@ def spanner_gql_to_g(self: Plottable, query: str) -> Plottable: if spanner_config is not None: logger.debug(f"Spanner Config: {spanner_config}") else: - logger.warning('PyGraphistry._config["spanner"] is None') + raise ValueError('spanner_config is None, use spanner_init() or register() passing spanner_config') res = res.spanner_init(PyGraphistry._config["spanner"]) # type: ignore[attr-defined] diff --git a/graphistry/__init__.py b/graphistry/__init__.py index 04e255de89..e471430eba 100644 --- a/graphistry/__init__.py +++ b/graphistry/__init__.py @@ -33,6 +33,7 @@ tigergraph, spanner_gql_to_g, spanner_query_to_df, + spanner_init, gsql, gsql_endpoint, cosmos, diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index 12098e36b9..c4ab9f0656 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -195,7 +195,7 @@ def get_nodes_df(json_data: list) -> pd.DataFrame: Converts spanner json nodes into a pandas DataFrame. :param json_data: The structured JSON data containing graph nodes. - :return: A DataFrame containing node information + :return: A DataFrame containing node data from Spanner, col names will match node properties. :rtype: pd.DataFrame """ nodes = [ @@ -220,7 +220,7 @@ def get_edges_df(json_data: list) -> pd.DataFrame: :param json_data: The structured JSON data containing graph edges. :type list - :return: A DataFrame containing edge information. + :return: A DataFrame containing edge data from Spanner, col names will match edge properties. 
:rtype: pd.DataFrame """ diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index 0e0717897b..32b701d8f1 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -571,8 +571,8 @@ def set_bolt_driver(driver=None): PyGraphistry._config["bolt_driver"] = bolt_util.to_bolt_driver(driver) @staticmethod - # def set_spanner_config(spanner_config: Optional[Union[Dict, Any] = None): - def set_spanner_config(spanner_config): + # def set_spanner_config(spanner_config): + def set_spanner_config(spanner_config: Optional[Union[Dict, str]] = None): """ Saves the spanner config to internal Pygraphistry _config :param spanner_config: dict of the project_id, instance_id and database_id @@ -2647,6 +2647,7 @@ def _handle_api_response(response): tigergraph = PyGraphistry.tigergraph spanner_gql_to_g = PyGraphistry.spanner_gql_to_g spanner_query_to_df = PyGraphistry.spanner_query_to_df +spanner_init = PyGraphistry.spanner_init cosmos = PyGraphistry.cosmos neptune = PyGraphistry.neptune gremlin = PyGraphistry.gremlin From bc71b2b3e2103a185ffa5ad7b4daf7465b0d0a7a Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 22 Jan 2025 15:49:21 -0600 Subject: [PATCH 30/33] fix lint error and add plot output back to notebook --- graphistry/plugins/spannergraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index c4ab9f0656..b49af86a6a 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -273,4 +273,4 @@ def query_to_df(self, query: str) -> pd.DataFrame: # create DataFrame from json results, adding column names return pd.DataFrame(query_result.data, columns=query_result.column_names) - \ No newline at end of file + From cc0c88c40eff0694c7f1bd12a5182f73bb349fac Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 22 Jan 2025 15:49:31 -0600 Subject: [PATCH 31/33] fix lint error and add plot output back to notebook --- 
.../google_spanner_finance_graph.ipynb | 596 ++++++++++++++++-- 1 file changed, 553 insertions(+), 43 deletions(-) diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb index c97e281726..db2591dec0 100644 --- a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb +++ b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb @@ -27,38 +27,29 @@ "source": [ "# Demo Notebook - Graphistry and Google Spanner Graph \n", "\n", - "This interactive guide demonstrates how to combine the power of Graphistry's visual graph analytics and AI with the robust data capabilities of Google Cloud Spanner Graph. \n", - "\n", - "#### Why Graphistry + Google Cloud Spanner?\n", - "\n", "Graphistry is a cutting-edge platform for large-scale visual graph exploration and analysis. It enables users to intuitively investigate complex relationships, patterns and anomalies across vast datasets through highly interactive, GPU-accelerated visualizations. Google Cloud Spanner, on the other hand, is a globally distributed, horizontally scalable, and strongly consistent database ideal for managing large, interconnected datasets.\n", "\n", - "Together, these technologies empower you to:\n", + "This interactive guide demonstrates how to combine the power of Graphistry's visual graph analytics and AI with the robust data capabilities of Google Cloud Spanner Graph. 
Together, these technologies empower you to:\n", + "\n", "- **Visualize Complex Graphs**: Easily explore relationships and uncover insights in your data through rich visual representations.\n", "- **Handle Large Datasets**: Leverage Cloud Spanner’s ability to manage vast amounts of interconnected information with strong consistency and scalability.\n", "- **Perform Advanced Analytics**: Apply graph-based algorithms and clustering techniques to extract actionable insights from structured data.\n", "\n", - "#### What Will You Learn?\n", + "This demo is designed for:\n", + "- **Data Scientists**: Interested in adding visual graph analytics to their toolkit.\n", + "- **Database Engineers**: Looking to integrate graph capabilities into their Cloud Spanner workflows.\n", + "- **Application Developer**: Prototyping applications built using Graphistry and Google Spanner \n", "\n", "This notebook showcases:\n", "1. **Connecting to Cloud Spanner**: How to retrieve and preprocess data from Cloud Spanner for graph processing.\n", "2. **Graph Visualization with Graphistry**: Turning raw data into meaningful visualizations to explore relationships and clusters.\n", "3. 
**Real-World Use Cases**: Applying these tools to solve practical problems \n", "\n", - "#### Who Is This For?\n", - "\n", - "This demo is designed for:\n", - "- **Data Scientists**: Interested in adding visual graph analytics to their toolkit.\n", - "- **Database Engineers**: Looking to integrate graph capabilities into their Cloud Spanner workflows.\n", - "- **Application Developer**: Prototyping applications built using Graphistry and Google Spanner \n", - "\n", "#### Prerequisites\n", "\n", - "To follow along, you’ll need:\n", "- A Google Cloud account with access to Cloud Spanner.\n", "- A Graphistry Enterprise Server or free-tier [Graphistry Hub account](https://www.graphistry.com/get-started) \n", - "- Python environment with Graphistry installed.\n", - "- Basic knowledge of SQL, GQL and graph concepts.\n", + "- Python environment with Graphistry and gcloud spanner support (see pip install below) \n", "- This demo is based on [FinGraph sample graph](https://codelabs.developers.google.com/codelabs/spanner-graph-getting-started#0)\n", "\n", "#### Let’s Get Started!\n", @@ -147,6 +138,8 @@ "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", "# For more options, see https://pygraphistry.readthedocs.io/en/latest/server/register.html\n", "\n", + "import os \n", + "\n", "graphistry.register(api=3, \n", " protocol = \"http\", \n", " server = os.getenv(\"GRAPHISTRY_SERVER\"),\n", @@ -169,13 +162,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "0a03e44e-12e9-4122-b06e-145defc74652", + "cell_type": "markdown", + "id": "bb895e0d-b2a4-474e-aba9-46f791ec9600", "metadata": {}, - "outputs": [], "source": [ - "#!gcloud auth application-default login" + "### Google web-based auth below, only required if not using a credentials json file:" ] }, { @@ -184,7 +175,9 @@ "id": "f175d952-c1cc-4793-aad5-0cdb5142b6fe", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#!gcloud auth 
application-default login" + ] }, { "cell_type": "markdown", @@ -220,10 +213,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "0b544ab6-18b4-4fc9-adb1-3137bec62fb7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "g.plot()" ] @@ -330,10 +351,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "fe4f9812-6105-4d22-999a-dea1e62a7820", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "g2 = graphistry.spanner_gql_to_g(query2)\n", "g2.plot()" @@ -341,10 +390,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "71f79ecb-faff-4d8d-a682-0ca332d73e57", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# now run again and retrive all the paths \n", "query2a='''GRAPH FinGraph\n", @@ -385,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "1379ce57-5c2a-4d20-94b1-2cfc6d2d6677", "metadata": {}, "outputs": [], @@ -399,7 +476,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "facacf92-7758-4796-9595-fc0d2230287c", "metadata": {}, "outputs": [], @@ -409,10 +486,120 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "d44f1a10-09c3-4407-9de1-01da06f54b2c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + 
"
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDNameTotalBorrowed
0337Tutmarc15269003.6
1484Greiner14098853.6
2370Morrisseau13912146.2
3113Paakkonen13022928.4
4416Greif12713990.0
568Cabon12256398.8
666Stinson11462716.8
746Riby10772732.5
8406Jöhncke10470230.8
9169Gubenko10330528.5
\n", + "
" + ], + "text/plain": [ + " ID Name TotalBorrowed\n", + "0 337 Tutmarc 15269003.6\n", + "1 484 Greiner 14098853.6\n", + "2 370 Morrisseau 13912146.2\n", + "3 113 Paakkonen 13022928.4\n", + "4 416 Greif 12713990.0\n", + "5 68 Cabon 12256398.8\n", + "6 66 Stinson 11462716.8\n", + "7 46 Riby 10772732.5\n", + "8 406 Jöhncke 10470230.8\n", + "9 169 Gubenko 10330528.5" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "Top10_Borrowers_df.head(10)" ] @@ -450,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "feb8de69-56bc-45d3-a364-af4b5fdd4e42", "metadata": {}, "outputs": [], @@ -460,10 +647,131 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "97308e97-8a5d-4546-9324-049b4a2af0a5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcreate_timeis_blockedtype
012020-01-10 14:22:20.222000+00:00Falsebrokerage account
122020-01-28 01:55:09.206000+00:00Falseprepaid card
232020-02-18 13:44:20.655000+00:00Falsebrokerage account
342020-02-29 16:49:53.902000+00:00Falsedebit card
452020-03-02 20:47:18.726000+00:00Falsebrokerage account
562020-03-21 22:25:34.327000+00:00Falsecustodial account
672020-04-14 00:53:48.932000+00:00Falsebrokerage account
782020-04-15 03:08:15.427000+00:00Truetrust account
892020-04-20 13:20:25.717000+00:00Falsecertificate of deposit
9102020-04-26 00:12:17.773000+00:00Falsedebit card
\n", + "
" + ], + "text/plain": [ + " id create_time is_blocked type\n", + "0 1 2020-01-10 14:22:20.222000+00:00 False brokerage account\n", + "1 2 2020-01-28 01:55:09.206000+00:00 False prepaid card\n", + "2 3 2020-02-18 13:44:20.655000+00:00 False brokerage account\n", + "3 4 2020-02-29 16:49:53.902000+00:00 False debit card\n", + "4 5 2020-03-02 20:47:18.726000+00:00 False brokerage account\n", + "5 6 2020-03-21 22:25:34.327000+00:00 False custodial account\n", + "6 7 2020-04-14 00:53:48.932000+00:00 False brokerage account\n", + "7 8 2020-04-15 03:08:15.427000+00:00 True trust account\n", + "8 9 2020-04-20 13:20:25.717000+00:00 False certificate of deposit\n", + "9 10 2020-04-26 00:12:17.773000+00:00 False debit card" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "accounts_df.head(10)" ] @@ -485,10 +793,120 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "fa101eb2-0098-4e62-ba3c-0a57ceba75ac", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
table_namecolumn_namespanner_type
0AccountidINT64
1Accountcreate_timeTIMESTAMP
2Accountis_blockedBOOL
3AccounttypeSTRING(MAX)
4AccountAuditsidINT64
5AccountAuditsaudit_timestampTIMESTAMP
6AccountAuditsaudit_detailsSTRING(MAX)
7AccountRepayLoanidINT64
8AccountRepayLoanloan_idINT64
9AccountRepayLoanamountFLOAT64
\n", + "
" + ], + "text/plain": [ + " table_name column_name spanner_type\n", + "0 Account id INT64\n", + "1 Account create_time TIMESTAMP\n", + "2 Account is_blocked BOOL\n", + "3 Account type STRING(MAX)\n", + "4 AccountAudits id INT64\n", + "5 AccountAudits audit_timestamp TIMESTAMP\n", + "6 AccountAudits audit_details STRING(MAX)\n", + "7 AccountRepayLoan id INT64\n", + "8 AccountRepayLoan loan_id INT64\n", + "9 AccountRepayLoan amount FLOAT64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "columns_df = graphistry.spanner_query_to_df('SELECT table_name, column_name, spanner_type FROM information_schema.columns')\n", "columns_df.head(10)" @@ -496,20 +914,112 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "e293115a-9d39-43fb-8402-ce6642b8ab74", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "94" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "len(columns_df.table_name.unique())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "bc0a477d-f10e-45da-a102-7a178894351a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
table_nametable_type
0AccountBASE TABLE
1AccountAuditsBASE TABLE
2AccountRepayLoanBASE TABLE
3AccountTransferAccountBASE TABLE
4LoanBASE TABLE
5PersonBASE TABLE
6PersonOwnAccountBASE TABLE
\n", + "
" + ], + "text/plain": [ + " table_name table_type\n", + "0 Account BASE TABLE\n", + "1 AccountAudits BASE TABLE\n", + "2 AccountRepayLoan BASE TABLE\n", + "3 AccountTransferAccount BASE TABLE\n", + "4 Loan BASE TABLE\n", + "5 Person BASE TABLE\n", + "6 PersonOwnAccount BASE TABLE" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "query_tables='''\n", "SELECT table_name, table_type\n", @@ -641,9 +1151,9 @@ ], "metadata": { "kernelspec": { - "display_name": "venv - spanner-test-1", + "display_name": "venv - spanner-test-5", "language": "python", - "name": "spanner-test-1" + "name": "spanner-test-5" }, "language_info": { "codemirror_mode": { From 1bbe1b11d313691415038daf1c8d707c8c65ab31 Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 22 Jan 2025 15:56:54 -0600 Subject: [PATCH 32/33] fix lint error about blank line at end of file --- graphistry/plugins/spannergraph.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphistry/plugins/spannergraph.py b/graphistry/plugins/spannergraph.py index b49af86a6a..7f29dbbf5c 100644 --- a/graphistry/plugins/spannergraph.py +++ b/graphistry/plugins/spannergraph.py @@ -273,4 +273,3 @@ def query_to_df(self, query: str) -> pd.DataFrame: # create DataFrame from json results, adding column names return pd.DataFrame(query_result.data, columns=query_result.column_names) - From c9dc546fde6af58400f304c669515978ba58924f Mon Sep 17 00:00:00 2001 From: Thomas Cook Date: Wed, 22 Jan 2025 16:21:43 -0600 Subject: [PATCH 33/33] remove stray comment --- graphistry/PlotterBase.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 3e47c12e45..f4ed0917a3 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -39,7 +39,6 @@ from .arrow_uploader import ArrowUploader from .nodexlistry import NodeXLGraphistry from .tigeristry import Tigeristry -#from .plugins.spannergraph import SpannerGraph from .util import 
setup_logger logger = setup_logger(__name__)