awslabs · komashk · Jun 17, 2024 · Jun 17, 2024 · Aug 16, 2024 · Aug 16, 2024
diff --git a/analyzers.ipynb b/analyzers.ipynb
@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analyzers Basic Tutorial\n",
+    "\n",
+    "__Updated June 2024 to use a new dataset__\n",
+    "\n",
+    "This notebook is a basic tutorial on using PyDeequ's Analyzers module to compute metrics on a Spark DataFrame."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "# indicate your Spark version, here we use Spark 3.5 with pydeequ 1.4.0\n",
+    "os.environ[\"SPARK_VERSION\"] = '3.5'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n",
+      "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n",
+      "com.amazon.deequ#deequ added as a dependency\n",
+      ":: resolving dependencies :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9;1.0\n",
+      "\tconfs: [default]\n",
+      "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n",
+      "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n",
+      "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n",
+      "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n",
+      "\tfound com.github.fommil.netlib#core;1.1.2 in central\n",
+      "\tfound net.sf.opencsv#opencsv;2.3 in central\n",
+      "\tfound com.github.rwl#jtransforms;2.4.0 in central\n",
+      "\tfound junit#junit;4.8.2 in central\n",
+      "\tfound org.apache.commons#commons-math3;3.2 in central\n",
+      "\tfound org.spire-math#spire_2.12;0.13.0 in central\n",
+      "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n",
+      "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n",
+      "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n",
+      "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n",
+      "\tfound org.slf4j#slf4j-api;1.7.5 in central\n",
+      ":: resolution report :: resolve 435ms :: artifacts dl 12ms\n",
+      "\t:: modules in use:\n",
+      "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n",
+      "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n",
+      "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n",
+      "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n",
+      "\tjunit#junit;4.8.2 from central in [default]\n",
+      "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n",
+      "\torg.apache.commons#commons-math3;3.2 from central in [default]\n",
+      "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n",
+      "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n",
+      "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n",
+      "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n",
+      "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n",
+      "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n",
+      "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n",
+      "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n",
+      "\t:: evicted modules:\n",
+      "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n",
+      "\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n",
+      "\t---------------------------------------------------------------------\n",
+      "\t|                  |            modules            ||   artifacts   |\n",
+      "\t|       conf       | number| search|dwnlded|evicted|| number|dwnlded|\n",
+      "\t---------------------------------------------------------------------\n",
+      "\t|      default     |   17  |   0   |   0   |   2   ||   15  |   0   |\n",
+      "\t---------------------------------------------------------------------\n",
+      ":: retrieving :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9\n",
+      "\tconfs: [default]\n",
+      "\t0 artifacts copied, 15 already retrieved (0kB/9ms)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24/06/14 23:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24/06/14 23:25:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pyspark.sql import SparkSession, Row, DataFrame\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "import sagemaker_pyspark\n",
+    "\n",
+    "import pydeequ\n",
+    "\n",
+    "classpath = \":\".join(sagemaker_pyspark.classpath_jars())\n",
+    "\n",
+    "spark = (SparkSession\n",
+    "    .builder\n",
+    "    .config(\"spark.driver.extraClassPath\", classpath)\n",
+    "    .config(\"spark.jars.packages\", pydeequ.deequ_maven_coord)\n",
+    "    .config(\"spark.jars.excludes\", pydeequ.f2j_maven_coord)\n",
+    "    .getOrCreate())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### We will be using the synthetic reviews dataset for Electronics products"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24/06/14 23:26:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "root\n",
+      " |-- marketplace: string (nullable = true)\n",
+      " |-- customer_id: string (nullable = true)\n",
+      " |-- review_id: string (nullable = true)\n",
+      " |-- product_id: string (nullable = true)\n",
+      " |-- product_title: string (nullable = true)\n",
+      " |-- star_rating: long (nullable = true)\n",
+      " |-- helpful_votes: long (nullable = true)\n",
+      " |-- total_votes: long (nullable = true)\n",
+      " |-- insight: string (nullable = true)\n",
+      " |-- review_headline: string (nullable = true)\n",
+      " |-- review_body: string (nullable = true)\n",
+      " |-- review_date: timestamp (nullable = true)\n",
+      " |-- review_year: long (nullable = true)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n",
+    "\n",
+    "df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24/06/14 23:26:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----------+--------------------+-------------------+--------------------+\n",
+      "|     entity|            instance|               name|               value|\n",
+      "+-----------+--------------------+-------------------+--------------------+\n",
+      "|     Column|           review_id|       Completeness|                 1.0|\n",
+      "|     Column|           review_id|ApproxCountDistinct|           3160409.0|\n",
+      "|Mutlicolumn|total_votes,star_...|        Correlation|-7.38808965018615...|\n",
+      "|    Dataset|                   *|               Size|           3010972.0|\n",
+      "|     Column|         star_rating|               Mean|  3.9999973430506826|\n",
+      "|     Column|     top star_rating|         Compliance|  0.7499993357626706|\n",
+      "|Mutlicolumn|total_votes,helpf...|        Correlation|  0.9817922803462663|\n",
+      "+-----------+--------------------+-------------------+--------------------+\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n",
+      "  warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Import only the names this notebook uses, so it is clear where each one comes from.\n",
+    "from pydeequ.analyzers import (\n",
+    "    AnalysisRunner,\n",
+    "    AnalyzerContext,\n",
+    "    ApproxCountDistinct,\n",
+    "    Completeness,\n",
+    "    Compliance,\n",
+    "    Correlation,\n",
+    "    Mean,\n",
+    "    Size,\n",
+    ")\n",
+    "\n",
+    "# Register every analyzer on one runner and compute them with a single run() call.\n",
+    "analysisResult = (\n",
+    "    AnalysisRunner(spark)\n",
+    "    .onData(df)\n",
+    "    .addAnalyzer(Size())  # dataset-level metric\n",
+    "    .addAnalyzer(Completeness(\"review_id\"))\n",
+    "    .addAnalyzer(ApproxCountDistinct(\"review_id\"))\n",
+    "    .addAnalyzer(Mean(\"star_rating\"))\n",
+    "    # Compliance takes a label (shown as the metric's instance) and a SQL predicate.\n",
+    "    .addAnalyzer(Compliance(\"top star_rating\", \"star_rating >= 4.0\"))\n",
+    "    .addAnalyzer(Correlation(\"total_votes\", \"star_rating\"))\n",
+    "    .addAnalyzer(Correlation(\"total_votes\", \"helpful_votes\"))\n",
+    "    .run()\n",
+    ")\n",
+    "\n",
+    "# Collect the successfully computed metrics into a Spark DataFrame and print it.\n",
+    "analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)\n",
+    "analysisResult_df.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>entity</th>\n",
+       "      <th>instance</th>\n",
+       "      <th>name</th>\n",
+       "      <th>value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Column</td>\n",
+       "      <td>review_id</td>\n",
+       "      <td>Completeness</td>\n",
+       "      <td>1.000000e+00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Column</td>\n",
+       "      <td>review_id</td>\n",
+       "      <td>ApproxCountDistinct</td>\n",
+       "      <td>3.160409e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Mutlicolumn</td>\n",
+       "      <td>total_votes,star_rating</td>\n",
+       "      <td>Correlation</td>\n",
+       "      <td>-7.388090e-04</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Dataset</td>\n",
+       "      <td>*</td>\n",
+       "      <td>Size</td>\n",
+       "      <td>3.010972e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Column</td>\n",
+       "      <td>star_rating</td>\n",
+       "      <td>Mean</td>\n",
+       "      <td>3.999997e+00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Column</td>\n",
+       "      <td>top star_rating</td>\n",
+       "      <td>Compliance</td>\n",
+       "      <td>7.499993e-01</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Mutlicolumn</td>\n",
+       "      <td>total_votes,helpful_votes</td>\n",
+       "      <td>Correlation</td>\n",
+       "      <td>9.817923e-01</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        entity                   instance                 name         value\n",
+       "0       Column                  review_id         Completeness  1.000000e+00\n",
+       "1       Column                  review_id  ApproxCountDistinct  3.160409e+06\n",
+       "2  Mutlicolumn    total_votes,star_rating          Correlation -7.388090e-04\n",
+       "3      Dataset                          *                 Size  3.010972e+06\n",
+       "4       Column                star_rating                 Mean  3.999997e+00\n",
+       "5       Column            top star_rating           Compliance  7.499993e-01\n",
+       "6  Mutlicolumn  total_votes,helpful_votes          Correlation  9.817923e-01"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "analysisResult_pandas_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult, pandas=True)\n",
+    "analysisResult_pandas_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### For more information, see the full list of analyzers in `docs/analyzers.md`."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}