From 1d532c9c2d9bd0f06c50dc4734941d9c681f0cce Mon Sep 17 00:00:00 2001
From: Tim Reichard
Date: Fri, 3 Nov 2023 13:28:33 -0500
Subject: [PATCH] Add common data science functions to ds_utils.py

---
 .pre-commit-config.yaml   |   2 +-
 .pylintrc                 |   2 +-
 HISTORY.rst               |  15 ++
 aioradio/ds_utils.py      | 286 +++++++++++++++++++++++++++++++-------
 aioradio/requirements.txt |  16 ++-
 setup.py                  |   8 +-
 6 files changed, 269 insertions(+), 60 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fec1b1f..b52375b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@ default_language_version:
   python: python3.11
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
     hooks:
     - id: check-added-large-files
     - id: check-ast
diff --git a/.pylintrc b/.pylintrc
index 417d671..5d310c5 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,2 +1,2 @@
 [FORMAT]
-max-line-length=140
+max-line-length=150
diff --git a/HISTORY.rst b/HISTORY.rst
index 797950a..df05343 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -3,6 +3,21 @@ History
 =======
 
 
+v0.19.4 (2023-11-03)
+
+* Add common data science functions to ds_utils.py.
+* Update cython==3.0.5.
+* Update httpx==0.25.1.
+* Update pandas==2.1.2.
+* Update pylint==3.0.2.
+* Update pytest==7.4.3.
+* Update wheel==0.41.3.
+* Add python library haversine==2.8.0.
+* Add python library polars==0.19.12.
+* Add python library pyarrow==13.0.0.
+* Add python library pyspark==3.4.1.
+
+
 v0.19.3 (2023-10-20)
 
 * Add TrustServerCertificate option in sqlserver connection string, enabling use of driver {ODBC Driver 18 for SQL Server}.
diff --git a/aioradio/ds_utils.py b/aioradio/ds_utils.py
index 2de6693..7897db2 100644
--- a/aioradio/ds_utils.py
+++ b/aioradio/ds_utils.py
@@ -1,9 +1,11 @@
 """utils.py."""
 
+# pylint: disable=broad-except
 # pylint: disable=import-outside-toplevel
 # pylint: disable=invalid-name
 # pylint: disable=logging-fstring-interpolation
 # pylint: disable=no-member
+# pylint: disable=protected-access
 # pylint: disable=too-many-arguments
 # pylint: disable=too-many-boolean-expressions
 # pylint: disable=unnecessary-comprehension
@@ -16,12 +18,18 @@
 import os
 import pickle
 import warnings
+from math import cos, degrees, radians, sin
 from platform import system
 from tempfile import NamedTemporaryFile
 from time import sleep, time
 
 import boto3
+import numpy as np
+import pyarrow as pa
 import pandas as pd
+import polars as pl
+from haversine import haversine, Unit
+from pyspark.sql import SparkSession
 from smb.SMBConnection import SMBConnection
 
 warnings.simplefilter(action='ignore', category=UserWarning)
@@ -36,6 +44,158 @@ c_handler.setFormatter(c_format)
 logger.addHandler(c_handler)
 
 
+spark = SparkSession.builder.getOrCreate()
+
+
+############################### Databricks functions ################################
+
+
+def db_catalog(env):
+    """Return the Databricks catalog based on the passed-in environment."""
+
+    catalog = ''
+    if env == 'sandbox':
+        catalog = 'dsc_sbx'
+    elif env == 'prod':
+        catalog = 'dsc_prd'
+
+    return catalog
+
+
+def sql_to_polars_df(sql):
+    """Get polars DataFrame from SQL query results."""
+
+    return pl.from_arrow(pa.Table.from_batches(spark.sql(sql)._collect_as_arrow()))
+
+
+def does_db_table_exists(name):
+    """Check if the delta table exists in Databricks."""
+
+    exists = False
+    try:
+        spark.sql(f"describe formatted {name}")
+        exists = True
+    except Exception:
+        pass
+
+    return exists
+
+
+def merge_spark_df_in_db(df, target, on, partition_by=None):
+    """Convert spark DF to a staging table, then merge it with the target table in
+    Databricks."""
+
+    stage = f"{target}_stage"
+
+    if not does_db_table_exists(target):
+        if partition_by is None:
+            df.write.option("delta.columnMapping.mode", "name").saveAsTable(target)
+        else:
+            df.write.option("delta.columnMapping.mode", "name").partitionBy(partition_by).saveAsTable(target)
+    else:
+        if partition_by is None:
+            df.write.option("delta.columnMapping.mode", "name").mode('overwrite').saveAsTable(stage)
+        else:
+            df.write.option("delta.columnMapping.mode", "name").mode('overwrite').partitionBy(partition_by).saveAsTable(stage)
+
+        on_clause = ' AND '.join(f'{target}.{col} = {stage}.{col}' for col in on)
+        match_clause = ', '.join(f'{target}.{col} = {stage}.{col}' for col in df.columns if col != 'CREATED_DATETIME')
+
+        try:
+            spark.sql(f'MERGE INTO {target} USING {stage} ON {on_clause} WHEN MATCHED THEN UPDATE SET {match_clause} WHEN NOT MATCHED THEN INSERT *')
+            spark.sql(f'DROP TABLE {stage}')
+        except Exception:
+            spark.sql(f'DROP TABLE {stage}')
+            raise
+
+
+def merge_pandas_df_in_db(df, target, on, partition_by=None):
+    """Convert pandas DF to a staging table, then merge it with the target table in
+    Databricks."""
+
+    stage = f"{target}_stage"
+
+    for col, dtype in df.dtypes.apply(lambda x: x.name).to_dict().items():
+        if dtype == 'object':
+            df[col] = df[col].astype('string[pyarrow]')
+            df[col].mask(df[col].isna(), '', inplace=True)
+        elif dtype == 'string':
+            # pyspark will throw an exception if strings are set to pd.NA, so convert them to empty strings
+            df[col].mask(df[col].isna(), '', inplace=True)
+
+    if not does_db_table_exists(target):
+        if partition_by is None:
+            spark.createDataFrame(df).write.option("delta.columnMapping.mode", "name").saveAsTable(target)
+        else:
+            spark.createDataFrame(df).write.option("delta.columnMapping.mode", "name").partitionBy(partition_by).saveAsTable(target)
+    else:
+        if partition_by is None:
+            spark.createDataFrame(df).write.option("delta.columnMapping.mode", "name").mode('overwrite').saveAsTable(stage)
+        else:
+            spark.createDataFrame(df).write.option("delta.columnMapping.mode", "name").mode('overwrite').partitionBy(partition_by).saveAsTable(stage)
+
+        on_clause = ' AND '.join(f'{target}.{col} = {stage}.{col}' for col in on)
+        match_clause = ', '.join(f'{target}.{col} = {stage}.{col}' for col in df.columns if col != 'CREATED_DATETIME')
+
+        try:
+            spark.sql(f'MERGE INTO {target} USING {stage} ON {on_clause} WHEN MATCHED THEN UPDATE SET {match_clause} WHEN NOT MATCHED THEN INSERT *')
+            spark.sql(f'DROP TABLE {stage}')
+        except Exception:
+            spark.sql(f'DROP TABLE {stage}')
+            raise
+
+
+################################## DataFrame functions ####################################
+
+
+def convert_pyspark_dtypes_to_pandas(df):
+    """The pyspark toPandas function converts strings to objects.
+
+    This function takes the resulting df and converts the object dtypes
+    to string[pyarrow], then it converts empty strings to pd.NA.
+    """
+
+    for col, dtype in df.dtypes.apply(lambda x: x.name).to_dict().items():
+
+        if dtype == 'object':
+            df[col] = df[col].astype('string[pyarrow]')
+            df[col].mask(df[col] == '', pd.NA, inplace=True)
+        elif (dtype.startswith('int') or dtype.startswith('float')) and not dtype.endswith('[pyarrow]'):
+            df[col] = df[col].astype(f'{dtype}[pyarrow]')
+        elif 'string' in dtype:
+            df[col] = df[col].astype('string[pyarrow]')
+            df[col].mask(df[col] == '', pd.NA, inplace=True)
+
+    return df
+
+
+def remove_pyarrow_dtypes(df):
+    """Switch pyarrow dtypes to non-pyarrow dtypes (e.g. int8[pyarrow] to int8)."""
+
+    df = df.astype({k: v.replace('[pyarrow]', '') for k, v in df.dtypes.apply(lambda x: x.name).to_dict().items()})
+    return df
+
+
+################################## AWS functions ####################################
+
+
+def get_boto3_session(env):
+    """Get Boto3 Session."""
+
+    aws_profile = os.getenv('AWS_PROFILE')
+
+    try:
+        if aws_profile is not None:
+            del os.environ['AWS_PROFILE']
+        aws_creds = get_aws_creds(env)
+        boto3_session = boto3.Session(**aws_creds)
+    except ValueError:
+        if aws_profile is not None:
+            os.environ["AWS_PROFILE"] = aws_profile
+        boto3_session = boto3.Session()
+
+    return boto3_session
+
 
 def file_to_s3(s3_client, local_filepath, s3_bucket, key):
     """Write file to s3."""
@@ -82,20 +242,6 @@ def delete_s3_object(s3_client, bucket, s3_prefix):
     return s3_client.delete_object(Bucket=bucket, Key=s3_prefix)
 
 
-def get_fice_institutions_map(db_config):
-    """Get mapping of fice to college from mssql table."""
-
-    from aioradio.pyodbc import pyodbc_query_fetchall
-
-    result = {}
-    with DbInfo(db_config) as target_db:
-        query = "SELECT FICE, Institution FROM EESFileuploadAssignments WHERE FileCategory = 'EnrollmentLens'"
-        rows = pyodbc_query_fetchall(conn=target_db.conn, query=query)
-        result = {fice: institution for fice, institution in rows}
-
-    return result
-
-
 def bytes_to_s3(s3_client, s3_bucket, key, body):
     """Write data in bytes to s3."""
 
@@ -185,24 +331,6 @@ def get_s3_pickle_to_object(s3_client, s3_bucket, key):
     return data
 
 
-def get_ftp_connection(secret_id, port=139, is_direct_tcp=False, env='sandbox'):
-    """Get SMB Connection."""
-
-    secret_client = get_boto3_session(env).client("secretsmanager", region_name='us-east-1')
-    creds = json.loads(secret_client.get_secret_value(SecretId=secret_id)['SecretString'])
-    conn = SMBConnection(
-        creds['user'],
-        creds['password'],
-        secret_id,
-        creds['server'],
-        use_ntlm_v2=True,
-        is_direct_tcp=is_direct_tcp
-    )
-    conn.connect(creds['server'], port)
-
-    return conn
-
-
 def get_aws_creds(env):
     """Get AWS credentials from environment variables."""
 
@@ -223,6 +351,79 @@ def get_aws_creds(env):
     return aws_creds
 
 
+get_s3_csv_to_df = get_large_s3_csv_to_df
+get_s3_parquet_to_df = get_large_s3_parquet_to_df
+
+
+################################# Misc functions ####################################
+
+
+def bearing(slat, elat, slon, elon):
+    """Calculate the compass bearing in degrees from point (slat, slon) to point (elat, elon)."""
+
+    slat, elat, slon, elon = radians(slat), radians(elat), radians(slon), radians(elon)
+    var_dl = elon - slon
+    var_x = cos(elat) * sin(var_dl)
+    var_y = cos(slat) * sin(elat) - sin(slat) * cos(elat) * cos(var_dl)
+    return (degrees(np.arctan2(var_x, var_y)) + 360) % 360
+
+
+def apply_bearing(dataframe, latitude, longitude):
+    """Apply bearing function on split dataframe."""
+
+    return dataframe.apply(lambda x: bearing(x.LATITUDE, latitude, x.LONGITUDE, longitude), axis=1)
+
+
+def apply_haversine(dataframe, latitude, longitude):
+    """Apply haversine function on split dataframe."""
+
+    return dataframe.apply(lambda x: haversine((x.LATITUDE, x.LONGITUDE), (latitude, longitude), unit=Unit.MILES), axis=1)
+
+
+def logit(x, a, b, c, d):
+    """Logit function."""
+
+    return a / (1 + np.exp(-c * (x - d))) + b
+
+
+def apply_logit(dataframe, a, b, c, d):
+    """Apply logit function on split dataframe."""
+
+    return dataframe.apply(lambda x: logit(x, a, b, c, d))
+
+
+def get_fice_institutions_map(db_config):
+    """Get mapping of fice to college from mssql table."""
+
+    from aioradio.pyodbc import pyodbc_query_fetchall
+
+    result = {}
+    with DbInfo(db_config) as target_db:
+        query = "SELECT FICE, Institution FROM EESFileuploadAssignments WHERE FileCategory = 'EnrollmentLens'"
+        rows = pyodbc_query_fetchall(conn=target_db.conn, query=query)
+        result = {fice: institution for fice, institution in rows}
+
+    return result
+
+
+def get_ftp_connection(secret_id, port=139, is_direct_tcp=False, env='sandbox'):
+    """Get SMB Connection."""
+
+    secret_client = get_boto3_session(env).client("secretsmanager", region_name='us-east-1')
+    creds = json.loads(secret_client.get_secret_value(SecretId=secret_id)['SecretString'])
+    conn = SMBConnection(
+        creds['user'],
+        creds['password'],
+        secret_id,
+        creds['server'],
+        use_ntlm_v2=True,
+        is_direct_tcp=is_direct_tcp
+    )
+    conn.connect(creds['server'], port)
+
+    return conn
+
+
 def monitor_domino_run(domino, run_id, sleep_time=10):
     """Monitor domino job run and return True/False depending if job was
     successful."""
@@ -241,24 +442,6 @@ def monitor_domino_run(domino, run_id, sleep_time=10):
     return status
 
 
-def get_boto3_session(env):
-    """Get Boto3 Session."""
-
-    aws_profile = os.getenv('AWS_PROFILE')
-
-    try:
-        if aws_profile is not None:
-            del os.environ['AWS_PROFILE']
-        aws_creds = get_aws_creds(env)
-        boto3_session = boto3.Session(**aws_creds)
-    except ValueError:
-        if aws_profile is not None:
-            os.environ["AWS_PROFILE"] = aws_profile
-        boto3_session = boto3.Session()
-
-    return boto3_session
-
-
 def get_domino_connection(secret_id, project, host, env='sandbox'):
     """Get domino connection."""
 
@@ -268,6 +451,9 @@ def get_domino_connection(secret_id, project, host, env='sandbox'):
     return Domino(project=project, api_key=api_key, host=host)
 
 
+######################## Postgres or MSSQL Connection Classes #######################
+
+
 class DB_CONNECT():
     """[Class for database connection]
 
diff --git a/aioradio/requirements.txt b/aioradio/requirements.txt
index 0556a83..aedc1af 100644
--- a/aioradio/requirements.txt
+++ b/aioradio/requirements.txt
@@ -3,29 +3,33 @@ aiojobs==1.2.0
 backoff==2.2.1
 boto3==1.28.17
 botocore==1.31.17
-cython==3.0.4
+cython==3.0.5
 ddtrace==1.11.2
 dominodatalab==1.2.4
 fakeredis==1.10.1
 faust-cchardet==2.1.19
 flask==2.1.2
 flask-cors==3.0.10
-httpx==0.25.0
+haversine==2.8.0
+httpx==0.25.1
 mandrill==1.0.60
 moto==3.1.18
 openpyxl==3.0.10
 orjson==3.8.10
-pandas==2.1.1
+pandas==2.1.2
+polars==0.19.12
 pre-commit==3.5.0
 psycopg2-binary==2.9.9
-pylint==3.0.1
+pyarrow==13.0.0
+pylint==3.0.2
 pyodbc==4.0.39 --no-binary=pyodbc
 pysmb==1.2.9.1
-pytest==7.4.2
+pyspark==3.4.1
+pytest==7.4.3
 pytest-asyncio==0.21.1
 pytest-cov==4.1.0
 python-json-logger==2.0.7
 redis==3.5.3
 twine==4.0.2
 werkzeug==2.1.2
-wheel==0.41.2
+wheel==0.41.3
diff --git a/setup.py b/setup.py
index e0386fb..490f86a 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
     long_description = fileobj.read()
 
 setup(name='aioradio',
-      version='0.19.3',
+      version='0.19.4',
       description='Generic asynchronous i/o python utilities for AWS services (SQS, S3, DynamoDB, Secrets Manager), Redis, MSSQL (pyodbc), JIRA and more',
       long_description=long_description,
       long_description_content_type="text/markdown",
@@ -26,17 +26,21 @@
           'backoff>=2.1.2',
           'botocore==1.31.17',
           'boto3==1.28.17',
-          'faust-cchardet>=2.1.18',
           'ddtrace>=0.60.1',
+          'faust-cchardet>=2.1.18',
           'fakeredis>=1.7.1',
+          'haversine>=2.8.0',
           'httpx>=0.23.0',
           'mandrill>=1.0.60',
           'numpy>=1.19',
           'openpyxl==3.0.10',
           'orjson>=3.6.8',
           'pandas>=1.3.5',
+          'polars>=0.19.12',
+          'pyarrow>=13.0.0',
           'pysmb>=1.2.7',
           'python-json-logger>=2.0.2',
+          'pyspark>=3.4.1',
           'redis==3.5.3'
       ],
       include_package_data=True,
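
Usage sketch for the new ds_utils helpers (illustrative only, not applied by this patch): the catalog, schema, table, and column names below are hypothetical, and the snippet assumes an environment with the updated requirements installed, since importing aioradio.ds_utils creates or attaches to a SparkSession at import time.

    import pandas as pd

    from aioradio.ds_utils import apply_bearing, apply_haversine, db_catalog, merge_pandas_df_in_db

    # Hypothetical frame of institutions; the LATITUDE/LONGITUDE column names are required
    # because apply_bearing and apply_haversine reference x.LATITUDE and x.LONGITUDE.
    df = pd.DataFrame({
        "FICE": ["001234", "005678"],
        "LATITUDE": [30.2849, 40.0076],
        "LONGITUDE": [-97.7341, -105.2659],
    })

    # Distance in miles and compass bearing in degrees from each row to a reference point.
    ref_lat, ref_lon = 39.7392, -104.9903
    df["MILES_TO_REF"] = apply_haversine(df, ref_lat, ref_lon)
    df["BEARING_TO_REF"] = apply_bearing(df, ref_lat, ref_lon)

    # On a Databricks cluster the frame could then be upserted into a Delta table
    # (hypothetical catalog.schema.table name) keyed on FICE.
    target = f"{db_catalog('sandbox')}.geo.institution_distances"
    merge_pandas_df_in_db(df, target, on=["FICE"])

merge_pandas_df_in_db writes the frame to a {target}_stage table and issues a MERGE INTO keyed on the columns passed in on, so repeated runs update existing rows instead of duplicating them.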