-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
#42 Adding a new Jupyter notebook example that uses a Jupyter notebook and the Trino Python clien.
- Loading branch information
Showing
2 changed files
with
322 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,312 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "98c657d7-ef2b-49cf-99de-9a307b4f5d93", | ||
"metadata": {}, | ||
"source": [ | ||
"## Gravitino Trino Example\n", | ||
"\n", | ||
"In this example, we will use `Jupyter` and the `Trino Python Client` to experience `Gravitino`." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a0dbfc33-8811-4b8e-a43b-24c0a4c934af", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# install trino python client and pandas\n", | ||
"%pip install trino pandas" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "fb9e0bda-c0bf-4aaf-9094-bb14649e688c", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from trino.dbapi import connect\n", | ||
"\n", | ||
"# Create a Trino connector client\n", | ||
"conn = connect(\n", | ||
" host=\"trino\",\n", | ||
" port=8080,\n", | ||
" user=\"admin\",\n", | ||
" catalog=\"catalog_hive\",\n", | ||
" schema=\"http\",\n", | ||
")\n", | ||
"\n", | ||
"trino_client = conn.cursor()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "dd240dfe-f4a9-4fa0-8366-82a8df34273d", | ||
"metadata": {}, | ||
"source": [ | ||
"## Prepare\n", | ||
"\n", | ||
"Creates a schema named `catalog_hive.company` in Hive, with its location set to`hdfs://hive:9000/user/hive/warehouse/company.db` on HDFS." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "85eea564-cb4f-476d-a373-fc71b5086f24", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"trino_client.execute(\"\"\"\n", | ||
"CREATE SCHEMA catalog_hive.company\n", | ||
" WITH (location = 'hdfs://hive:9000/user/hive/warehouse/company.db')\n", | ||
"\"\"\").fetchall()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "d4015b50-470c-47c0-b74f-6d8268cd6d1c", | ||
"metadata": {}, | ||
"source": [ | ||
"Displays the SQL command that was used to create the schema `catalog_hive.company`." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c9424567-a5a9-4b0a-9a43-82a088812a79", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"trino_client.execute(\"\"\"\n", | ||
"SHOW CREATE SCHEMA catalog_hive.company\n", | ||
"\"\"\").fetchall()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "96339930-dbab-46d8-955e-25917c2ea5e4", | ||
"metadata": {}, | ||
"source": [ | ||
"Create `employees` table" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "46eda853-bf85-4c06-96b7-5b09d08141db", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create Table\n", | ||
"trino_client.execute(\n", | ||
"\"\"\"\n", | ||
"CREATE TABLE catalog_hive.company.employees\n", | ||
"(\n", | ||
" name varchar,\n", | ||
" salary decimal(10,2)\n", | ||
")\n", | ||
"WITH (\n", | ||
" format = 'TEXTFILE'\n", | ||
")\n", | ||
"\"\"\"\n", | ||
").fetchall()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "52921a17-d50f-42b1-870c-523efba622f7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Insert data\n", | ||
"print(trino_client.execute(\"INSERT INTO catalog_hive.company.employees (name, salary) VALUES ('Sam Evans', 55000)\").fetchall())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c8b5c0f8-2650-4e0f-85d3-1dfd396cee21", | ||
"metadata": {}, | ||
"source": [ | ||
"## Simple queries\n", | ||
"\n", | ||
"Some simple query testing." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "65b02105-c530-431a-8302-3ce8ff1a6463", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"\n", | ||
"# Show employees table contents\n", | ||
"df = pd.DataFrame(trino_client.execute(\"SELECT * FROM catalog_hive.company.employees\").fetchall(), columns=['Name', 'Salary'])\n", | ||
"\n", | ||
"# Display the DataFrame\n", | ||
"df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ebb7f9fc-8ae8-4298-b49d-7f7b06b6b8c3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Execute the queries and convert the results directly to DataFrames\n", | ||
"df_g = pd.DataFrame(trino_client.execute(\"SHOW SCHEMAS from catalog_hive\").fetchall(), columns=['Schema'])\n", | ||
"df_g" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "de3556c5-35b8-4ad2-8c67-4f1e985513c3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"h = trino_client.execute(\"DESCRIBE catalog_hive.company.employees\").fetchall()\n", | ||
"h" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "cae3b62f-5ad8-4959-8023-afd758e4103e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df_i = pd.DataFrame(trino_client.execute(\"SHOW TABLES from catalog_hive.company\").fetchall(), columns=['Tables'])\n", | ||
"df_i" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "3e00c989-607e-4652-8288-54e2629ac62d", | ||
"metadata": {}, | ||
"source": [ | ||
"## Cross-catalog queries\n", | ||
"\n", | ||
"In a company, there may be different departments using different data stacks. In this example, the HR department uses Apache Hive to store its data and the sales department uses PostgreSQL. You can run some interesting queries by joining the two departments' data together with Gravitino.\n", | ||
"\n", | ||
"To know which employee has the largest sales amount:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "83af02a6-526a-44f8-b99d-54721b3cd05d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Cross-catalog queries\n", | ||
"cross_catalog = trino_client.execute(\"\"\"\n", | ||
"SELECT given_name, family_name, job_title, sum(total_amount) AS total_sales\n", | ||
"FROM catalog_hive.sales.sales as s,\n", | ||
" catalog_postgres.hr.employees AS e\n", | ||
"where s.employee_id = e.employee_id\n", | ||
"GROUP BY given_name, family_name, job_title\n", | ||
"ORDER BY total_sales DESC\n", | ||
"LIMIT 1\n", | ||
"\"\"\").fetchall()\n", | ||
"\n", | ||
"# Convert the result to a DataFrame\n", | ||
"df_j = pd.DataFrame(cross_catalog, columns=['Given Name', 'Family Name', 'Job Title', 'Total Sales'])\n", | ||
"\n", | ||
"df_j" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "b92fcfb4-c6c9-4fff-9193-6562467996b7", | ||
"metadata": {}, | ||
"source": [ | ||
"To know the top customers who bought the most by state:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ff956ab9-60ef-485b-8e75-f907ed1707c2", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Execute the query\n", | ||
"k = trino_client.execute(\"\"\"\n", | ||
"SELECT customer_name, location, SUM(total_amount) AS total_spent\n", | ||
"FROM catalog_hive.sales.sales AS s,\n", | ||
" catalog_hive.sales.stores AS l,\n", | ||
" catalog_hive.sales.customers AS c\n", | ||
"WHERE s.store_id = l.store_id AND s.customer_id = c.customer_id\n", | ||
"GROUP BY location, customer_name\n", | ||
"ORDER BY location, SUM(total_amount) DESC\n", | ||
"\"\"\").fetchall()\n", | ||
"\n", | ||
"# Convert the result to a DataFrame\n", | ||
"df_k = pd.DataFrame(k, columns=['Customer Name', 'Location', 'Total Spent'])\n", | ||
"\n", | ||
"# Display the DataFrame\n", | ||
"df_k" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c2369ac9-d92c-4d35-8753-11c5c2cb0f47", | ||
"metadata": {}, | ||
"source": [ | ||
"To know the employee's average performance rating and total sales:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d9d6e4a4-d913-4d4d-9153-e25fd4df3f68", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Execute the query\n", | ||
"l = trino_client.execute(\"\"\"\n", | ||
"SELECT e.employee_id, given_name, family_name, AVG(rating) AS average_rating, SUM(total_amount) AS total_sales\n", | ||
"FROM catalog_postgres.hr.employees AS e,\n", | ||
" catalog_postgres.hr.employee_performance AS p,\n", | ||
" catalog_hive.sales.sales AS s\n", | ||
"WHERE e.employee_id = p.employee_id AND p.employee_id = s.employee_id\n", | ||
"GROUP BY e.employee_id, given_name, family_name\n", | ||
"\"\"\").fetchall()\n", | ||
"\n", | ||
"# Convert the result to a DataFrame\n", | ||
"df_l = pd.DataFrame(l, columns=['Employee ID', 'Given Name', 'Family Name', 'Average Rating', 'Total Sales'])\n", | ||
"\n", | ||
"# Display the DataFrame\n", | ||
"df_l" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |