added spanner_query_to_df and other fixes from PR comments

graphistry · Jan 17, 2025 · 116334a · 116334a
1 parent f374279
commit 116334a
Show file tree

Hide file tree

Showing 5 changed files with 349 additions and 139 deletions.
diff --git a/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb b/demos/demos_databases_apis/spanner/google_spanner_finance_graph.ipynb
@@ -95,31 +95,50 @@
     "!gcloud auth application-default login"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11e41457-303c-4d5e-ae0e-7015db33d9f7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
-   "id": "56bc01a7-76ea-44f6-b1cc-c5488c5c5922",
+   "id": "88eb24b0-7d3b-4629-813c-fc989ba1ea90",
    "metadata": {},
    "source": [
-    "### Spanner GQL Query to Graphistry Visualization"
+    "### Example 1: GQL Path Query to Graphistry Visualization of all nodes and edges (LIMIT optional) \n",
+    "\n",
+    "To extract the data from Spanner Graph as a graph with nodes and edges in a single object, a GQL path query is required. \n",
+    "\n",
+    "The format of a path query is as follows. Note the `p =` at the start of the MATCH clause and the `SAFE_TO_JSON(p)` in the return clause; without these, \n",
+    "the query will not produce the results needed to properly load a graphistry graph. LIMIT is optional, but for large graphs with millions\n",
+    "of edges or more, it's best to either filter in the query or use LIMIT so as not to exhaust GPU memory.  \n",
+    "\n",
+    "```python\n",
+    "GRAPH FinGraph\n",
+    "MATCH p = (a)-[b]->(c) where 1=1 LIMIT 10000 return SAFE_TO_JSON(p) as path\n",
+    "```\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e7ab1379-14bf-4c03-b5b2-28df22609029",
+   "id": "58ee08b2-29e1-47db-b0a8-440f6171e54d",
    "metadata": {},
    "outputs": [],
    "source": [
     "query=f'''GRAPH FinGraph\n",
-    "MATCH p = (a)-[b]->(c) where 1=1 {LIMIT_CLAUSE} return TO_JSON(p) as path'''\n",
+    "MATCH p = (a)-[b]->(c) where 1=1 {LIMIT_CLAUSE} return SAFE_TO_JSON(p) as path'''\n",
     "\n",
-    "g = graphistry.spanner_query(query)"
+    "g = graphistry.spanner_gql_to_g(query)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "aeead725-928a-44fe-b5b5-630c830502e0",
+   "id": "3d606a3f-e807-4fa7-893e-52a95d238cc0",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -129,23 +148,23 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5f172d3e-108a-4a18-a88e-e012988013c5",
+   "id": "ae275af9-f354-454b-bbeb-e423ae4acfba",
    "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "markdown",
-   "id": "b40b6dae-0770-4839-a392-7cf16bee65d6",
+   "id": "b9d18502-9bc1-4f7e-af3a-b99f7d848e08",
    "metadata": {},
    "source": [
-    "#### inspect contents of graphistry graph (nodes and edges): "
+    "#### Example 1.1 - inspect contents of graphistry graph (nodes and edges): "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1fed46dd-2bb5-4563-9a21-920d299ba30d",
+   "id": "42e451d7-8f97-45a8-bb45-7c4271f11f68",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -155,7 +174,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ad33fde2-3aae-4da0-8532-41d489c6fff1",
+   "id": "e5078632-24b2-4139-8e4f-a2c18f1efd94",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -165,7 +184,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "279f4cb9-daa1-4c2b-af69-85a5b0a771fb",
+   "id": "376bf2a7-931a-4c3f-bfda-3413734e5ad7",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -175,15 +194,67 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2a60ee00-055d-4cee-b286-c5e3d6d85f09",
+   "id": "05d8b293-62b7-4056-b924-ff04284559b9",
    "metadata": {},
    "outputs": [],
    "source": []
   },
+  {
+   "cell_type": "markdown",
+   "id": "0013cceb-f32f-48a0-9f02-6b75a2704294",
+   "metadata": {},
+   "source": [
+    "### Example 2: Spanner GQL Query to pandas dataframe (LIMIT optional) \n",
+    "\n",
+    "This example shows a non-path query that returns tabular results, which are then converted to a dataframe for easy manipulation and inspection of the results. \n",
+    "\n",
+    "```python\n",
+    "GRAPH FinGraph \n",
+    "MATCH (p:Person)-[]-()->(l:Loan)\n",
+    "RETURN p.id as ID, p.name AS Name, SUM(l.loan_amount) AS TotalBorrowed\n",
+    "ORDER BY TotalBorrowed DESC\n",
+    "LIMIT 10\n",
+    "```\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "11e41457-303c-4d5e-ae0e-7015db33d9f7",
+   "id": "523f57b2-d09f-4aa2-8626-5af74d5d9a20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_top10='''GRAPH FinGraph \n",
+    "MATCH (p:Person)-[]-()->(l:Loan) WHERE 1=1\n",
+    "RETURN p.id as ID, p.name AS Name, SUM(l.loan_amount) AS TotalBorrowed\n",
+    "ORDER BY TotalBorrowed DESC\n",
+    "LIMIT 10'''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a51917d3-6c17-4438-981d-b7d441ec89ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Top10_Borrowers_df = graphistry.spanner_query_to_df(query_top10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aab6fb57-6f09-41ae-927a-4f7b5f47db19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Top10_Borrowers_df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a55789a-16ed-4b6f-b2d0-e374999f1806",
    "metadata": {},
    "outputs": [],
    "source": []

diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py
@@ -2273,22 +2273,38 @@ def bolt(self, driver):
         res._bolt_driver = to_bolt_driver(driver)
         return res
 
-    # TODO(tcook): add pydocs, typing 
-    def spanner_init(self, spanner_config):
+    def spanner_init(self: Plottable, spanner_config: Dict[str, str]) -> Plottable:
+        """
+        Initializes a SpannerGraph object with the provided configuration and connects to the instance db
+
+        spanner_config dict must include the following keys; credentials_file is optional:
+            - "project_id": The GCP project ID.
+            - "instance_id": The Spanner instance ID.
+            - "database_id": The Spanner database ID.
+            - "credentials_file": json file API key for service accounts 
+
+        :param spanner_config: A dictionary containing the Spanner configuration.
+        :type spanner_config: Dict[str, str]
+        :return: Plottable with a Spanner connection 
+        :rtype: Plottable
+        :raises ValueError: If any of the required keys in `spanner_config` are missing or have invalid values.
+
+        """
         res = copy.copy(self)
 
         project_id = spanner_config["project_id"]
         instance_id = spanner_config["instance_id"]
         database_id = spanner_config["database_id"]
+        credentials_file = spanner_config["credentials_file"]
 
         # check if valid 
         required_keys = ["project_id", "instance_id", "database_id"]
         for key in required_keys:
             value = spanner_config.get(key)
-            if not value:  # checks for None or empty values
+            if not value:  # check for None or empty values
                 raise ValueError(f"Missing or invalid value for required Spanner configuration: '{key}'")
 
-        res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id)
+        res._spannergraph = SpannerGraph(res, project_id, instance_id, database_id, credentials_file)
         logger.debug("Created SpannerGraph object: {res._spannergraph}")
         return res
 
@@ -2481,28 +2497,91 @@ def cypher(self, query: str, params: Dict[str, Any] = {}) -> Plottable:
             .edges(edges)
 
 
-    def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable:
+    def spanner_gql_to_g(self: Plottable, query: str) -> Plottable:
         """
-        TODO(tcook):  maybe rename to spanner_query_gql since spanner supports multiple languages. SQL, GQL, etc
+        Submit GQL query to google spanner graph database and return Plottable with nodes and edges populated  
+        
+        GQL must be a path query with a syntax similar to the following. It's recommended to return the path with
+        SAFE_TO_JSON(p); TO_JSON() can also be used, but is not recommended. LIMIT is optional, but for large graphs with millions
+        of edges or more, it's best to filter either in the query or use LIMIT so as not to exhaust GPU memory.  
+
+        query=f'''GRAPH my_graph
+        MATCH p = (a)-[b]->(c) LIMIT 100000 return SAFE_TO_JSON(p) as path'''
 
-        query google spanner graph database and return Plottable with nodes and edges populated  
         :param query: GQL query string 
         :type query: Str
+
         :returns: Plottable with the results of GQL query as a graph
         :rtype: Plottable
 
-        **Example: calling spanner_query
+        **Example: calling spanner_gql_to_g**
+                ::
+
+                    import graphistry
+
+                    # credentials_file is optional, all others are required
+                    SPANNER_CONF = { "project_id":  PROJECT_ID,                 
+                                     "instance_id": INSTANCE_ID, 
+                                     "database_id": DATABASE_ID, 
+                                     "credentials_file": CREDENTIALS_FILE }
+
+                    graphistry.register(..., spanner_config=SPANNER_CONF)
+
+                    query=f'''GRAPH my_graph
+                    MATCH p = (a)-[b]->(c) LIMIT 100000 return SAFE_TO_JSON(p) as path'''
+
+                    g = graphistry.spanner_gql_to_g(query)
+
+                    g.plot()
+     
+        """
+
+        from .pygraphistry import PyGraphistry
+
+        res = copy.copy(self)
+
+        if res._spannergraph is None: 
+            spanner_config = PyGraphistry._config["spanner"]
+            if spanner_config is not None: 
+                logger.debug(f"Spanner Config: {spanner_config}")
+            else: 
+                logger.warn(f'PyGraphistry._config["spanner"] is None')
+
+            res = res.spanner_init(PyGraphistry._config["spanner"])
+
+        return res._spannergraph.gql_to_graph(query)
+
+    def spanner_query_to_df(self: Plottable, query: str) -> pd.DataFrame:
+        """
+
+        Submit query to google spanner database and return a df of the results 
+        
+        The query can be SQL or GQL, as long as it returns a table of results.
+
+        query='SELECT * from Account limit 10000'
+
+        :param query: query string 
+        :type query: str
+
+        :returns: Pandas DataFrame with the results of query
+        :rtype: pd.DataFrame
+
+        **Example: calling spanner_query_to_df**
                 ::
 
                     import graphistry
 
-                    SPANNER_CONF = { "project_id":  PROJECT_ID, 
+                    # credentials_file is optional, all others are required
+                    SPANNER_CONF = { "project_id":  PROJECT_ID,                 
                                      "instance_id": INSTANCE_ID, 
-                                     "database_id": DATABASE_ID }
+                                     "database_id": DATABASE_ID, 
+                                     "credentials_file": CREDENTIALS_FILE }
 
                     graphistry.register(..., spanner_config=SPANNER_CONF)
 
-                    g = graphistry.spanner_query("Graph MyGraph\nMATCH ()-[]->()" )
+                    query='SELECT * from Account limit 10000'
+
+                    df = graphistry.spanner_query_to_df(query)
 
                     g.plot()
      
@@ -2517,12 +2596,11 @@ def spanner_query(self, query: str, params: Dict[str, Any] = {}) -> Plottable:
             if spanner_config is not None: 
                 logger.debug(f"Spanner Config: {spanner_config}")
             else: 
-                logger.debug(f'PyGraphistry._config["spanner"] is None')
+                logger.warn(f'PyGraphistry._config["spanner"] is None')
 
             res = res.spanner_init(PyGraphistry._config["spanner"])
-            return res._spannergraph.gql_to_graph(query)
-        else: 
-            return res._spannergraph.gql_to_graph(query)
+
+        return res._spannergraph.query_to_df(query)
 
 
     def nodexl(self, xls_or_url, source='default', engine=None, verbose=False):

diff --git a/graphistry/__init__.py b/graphistry/__init__.py
@@ -31,7 +31,8 @@
     bolt,
     cypher,
     tigergraph,
-    spanner_query,
+    spanner_gql_to_g,
+    spanner_query_to_df,
     gsql,
     gsql_endpoint,
     cosmos,