
Release/0.9 #97

Merged · 33 commits · Nov 29, 2024

Commits
3deae1a
feature: add support for Spark Connect (#63)
mikita-sakalouski Oct 29, 2024
90e6462
refactor: change private attr and step getter (#82)
mikita-sakalouski Oct 30, 2024
b37a302
90-Bug-fix-file-encoding-box-integration (#96)
louis-paulvlx Nov 8, 2024
6d6ccbd
added documentation
dannymeijer Nov 8, 2024
a0aa8a2
fix for DeltaMergeBuilder, when the instance doesn't check out (#100)
dannymeijer Nov 11, 2024
6ca9f30
Bump version to rc1
dannymeijer Nov 11, 2024
76207e7
Fix/98 sparkexpectations bump version to 220 (#99)
dannymeijer Nov 11, 2024
1e645d8
quick fix
dannymeijer Nov 12, 2024
abb435b
quick fix #2
dannymeijer Nov 12, 2024
fb54aef
[FIX] Accidental duplication of logs (#105)
dannymeijer Nov 18, 2024
93f413e
version bump
dannymeijer Nov 18, 2024
5146f59
fix: adjust branch fetching (#106)
mikita-sakalouski Nov 18, 2024
9f32fcc
[FIX] broken import statements and updated hello-world.md (#107)
dannymeijer Nov 19, 2024
047506a
fix: test github (#109)
mikita-sakalouski Nov 21, 2024
79f32aa
[Fix] Add overwrite functionality to the BoxFileWriterClass (#103)
ToneVDB Nov 21, 2024
4596410
[FEATURE] Enable adding options to DeltaReader both streaming and wri…
mikita-sakalouski Nov 22, 2024
ac95f2d
chore: bump version
mikita-sakalouski Nov 22, 2024
602866b
[feature] Add support for HyperProcess parameters (#112)
maxim-mityutko Nov 22, 2024
c34abbe
[HOTFIX] Remove duplicated implementation (#116)
mikita-sakalouski Nov 24, 2024
7fb1b27
[FEATURE] Populate account from url if not provided in SnowflakeBaseM…
mikita-sakalouski Nov 24, 2024
b9c0299
hotfix: check url alias - sfURL
mikita-sakalouski Nov 24, 2024
7fe5920
test: add test for account population from sfURL in SnowflakeRunQuery…
mikita-sakalouski Nov 24, 2024
7c00d7f
chore: bump version to 0.9.0rc5
mikita-sakalouski Nov 24, 2024
6faa20d
refactor: replace RunQuery with SnowflakeRunQueryPython (#121)
mikita-sakalouski Nov 25, 2024
9496eb5
hotfix: snowflake python connector default config dir (#125)
mikita-sakalouski Nov 25, 2024
ea2d15e
version bump
dannymeijer Nov 26, 2024
c72f381
Fix/delta merge builder instance check for connect + util fix (#130)
dannymeijer Nov 26, 2024
a085947
Release/0.9 - final version bump and docs (#132)
dannymeijer Nov 27, 2024
50f1e69
[FEATURE] Make Transformations callable (#126)
dannymeijer Nov 27, 2024
de56d00
[BUG] small fix for Tableau Server path checking (#134)
dannymeijer Nov 28, 2024
a7d2997
[FEATURE] DataBricksSecret for getting secrets from DataBricks scope …
mikita-sakalouski Nov 29, 2024
1e21e37
[FIX] Remove mention of non-existent class type in docs (#138)
dannymeijer Nov 29, 2024
5298c2b
[FIX] unused SparkSession being import from pyspark.sql in several te…
dannymeijer Nov 29, 2024
13 changes: 8 additions & 5 deletions .github/workflows/test.yml
@@ -6,6 +6,7 @@ on:
pull_request:
branches:
- main
- release/*
workflow_dispatch:
inputs:
logLevel:
@@ -40,8 +41,8 @@ jobs:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.ref }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
- name: Fetch main branch
run: git fetch origin main:main
- name: Fetch target branch
run: git fetch origin ${{ github.event.pull_request.head.ref || 'main'}}:${{ github.event.pull_request.base.ref || 'main'}}
- name: Check changes
id: check
run: |
@@ -61,7 +62,7 @@

tests:
needs: check_changes
if: needs.check_changes.outputs.python_changed > 0 || needs.check_changes.outputs.toml_changed > 0 || github.event_name == 'workflow_dispatch'
if: needs.check_changes.outputs.python_changed > 0 || needs.check_changes.outputs.toml_changed > 0 || github.event_name == 'workflow_dispatch' || ${{ github.event.pull_request.head.repo.owner.login }} != ${{ github.event.pull_request.base.repo.owner.login }}
name: Python ${{ matrix.python-version }} with PySpark ${{ matrix.pyspark-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
runs-on: ${{ matrix.os }}

@@ -71,10 +72,12 @@ jobs:
# os: [ubuntu-latest, windows-latest, macos-latest] # FIXME: Add Windows and macOS
os: [ubuntu-latest]
python-version: ['3.9', '3.10', '3.11', '3.12']
pyspark-version: ['33', '34', '35']
pyspark-version: ['33', '34', '35', '35r']
exclude:
- python-version: '3.9'
pyspark-version: '35'
- python-version: '3.9'
pyspark-version: '35r'
- python-version: '3.11'
pyspark-version: '33'
- python-version: '3.11'
@@ -100,7 +103,7 @@ jobs:
# hatch fmt --check --python=${{ matrix.python-version }}

- name: Run tests
run: hatch test --python=${{ matrix.python-version }} -i version=pyspark${{ matrix.pyspark-version }}
run: hatch test --python=${{ matrix.python-version }} -i version=pyspark${{ matrix.pyspark-version }} --verbose

# https://github.com/marketplace/actions/alls-green#why
final_check: # This job does nothing and is only used for the branch protection
1 change: 1 addition & 0 deletions .gitignore
@@ -139,3 +139,4 @@ out/**

# DevContainer
.devcontainer
uv.lock
20 changes: 16 additions & 4 deletions README.md
@@ -194,23 +194,35 @@ the `pyproject.toml` entry mentioned above or installing through pip.
### Integrations

- __Spark Expectations:__
Available through the `koheesio.steps.integration.spark.dq.spark_expectations` module; installable through the `se` extra.
Available through the `koheesio.integrations.spark.dq.spark_expectations` module; installable through the `se` extra.
- SE Provides Data Quality checks for Spark DataFrames.
- For more information, refer to the [Spark Expectations docs](https://engineering.nike.com/spark-expectations).

[//]: # (- **Brickflow:** Available through the `koheesio.steps.integration.workflow` module; installable through the `bf` extra.)
- __Spark Connect and Delta:__
Koheesio is ready to be used with Spark Connect. If you use the Delta package with a remote (Connect) session, you get full support on Databricks and partial support for the Delta package in Apache Spark. Full support for Delta in Apache Spark is coming with the release of PySpark 4.0.
- The spark extra can be installed by adding `koheesio[spark]` to the `pyproject.toml` entry mentioned above.
- The spark module is available through the `koheesio.spark` module.
- The delta module is available through the `koheesio.spark.writers.delta` module.
- For more information, refer to the [Databricks documentation](https://docs.databricks.com/).
- For more information on Apache Spark, refer to the [Apache Spark documentation](https://spark.apache.org/docs/latest/).

[//]: # (- **Brickflow:** Available through the `koheesio.integrations.workflow` module; installable through the `bf` extra.)
[//]: # ( - Brickflow is a workflow orchestration tool that allows you to define and execute workflows in a declarative way.)
[//]: # ( - For more information, refer to the [Brickflow docs](https://engineering.nike.com/brickflow))

- __Box__:
Available through the `koheesio.integration.box` module; installable through the `box` extra.
Available through the `koheesio.integrations.box` module; installable through the `box` extra.
- [Box](https://www.box.com) is a cloud content management and file sharing service for businesses.

- __SFTP__:
Available through the `koheesio.integration.spark.sftp` module; installable through the `sftp` extra.
Available through the `koheesio.integrations.spark.sftp` module; installable through the `sftp` extra.
- SFTP is a network protocol used for secure file transfer over a secure shell.
- The SFTP integration of Koheesio relies on [paramiko](https://www.paramiko.org/)

- __Snowflake__:
Available through the `koheesio.integrations.snowflake` module; installable through the `snowflake` extra.
- [Snowflake](https://www.snowflake.com) is a cloud-based data warehousing platform.

[//]: # (TODO: add implementations)
[//]: # (## Implementations)
[//]: # (TODO: add async extra)
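
A minimal sketch of the Spark Connect usage described above; the Connect endpoint URL, the `DummyReader` range, and the table name are placeholders, not part of this PR:

```python
from pyspark.sql import SparkSession

from koheesio.spark.etl_task import EtlTask
from koheesio.spark.readers.dummy import DummyReader
from koheesio.spark.writers.delta import DeltaTableWriter

# Remote (Spark Connect) session; replace the URL with your own Connect endpoint.
spark = SparkSession.builder.remote("sc://<connect-endpoint>:15002").getOrCreate()

# Read a small dummy DataFrame and write it to a Delta table using the active session.
task = EtlTask(
    source=DummyReader(range=5),                # placeholder reader producing a tiny DataFrame
    target=DeltaTableWriter(table="my_table"),  # table name is illustrative
    transformations=[],
)
task.execute()
```
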
12 changes: 6 additions & 6 deletions docs/reference/spark/transformations.md
@@ -76,7 +76,7 @@ Here's an example of a `ColumnsTransformation`:

```python
from pyspark.sql import functions as f
from koheesio.steps.transformations import ColumnsTransformation
from koheesio.spark.transformations import ColumnsTransformation

class AddOne(ColumnsTransformation):
def execute(self):
@@ -109,7 +109,7 @@ Here's an example of a `ColumnsTransformationWithTarget`:

```python
from pyspark.sql import Column
from koheesio.steps.transformations import ColumnsTransformationWithTarget
from koheesio.spark.transformations import ColumnsTransformationWithTarget

class AddOneWithTarget(ColumnsTransformationWithTarget):
def func(self, col: Column):
@@ -167,7 +167,7 @@

```python
from pyspark.sql import SparkSession
from koheesio.steps.transformations import DataframeLookup, JoinMapping, TargetColumn, JoinType
from koheesio.spark.transformations.lookup import DataframeLookup, JoinMapping, TargetColumn, JoinType

spark = SparkSession.builder.getOrCreate()
left_df = spark.createDataFrame([(1, "A"), (2, "B")], ["id", "value"])
@@ -191,7 +191,7 @@

```python
from pyspark.sql import SparkSession
from koheesio.steps.transformations import HashUUID5
from koheesio.spark.transformations.uuid5 import HashUUID5

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "A"), (2, "B")], ["id", "value"])
@@ -245,8 +245,8 @@

```python
from pyspark.sql import SparkSession
from koheesio.steps.transformations import HashUUID5
from koheesio.steps.transformations import DataframeLookup, JoinMapping, TargetColumn, JoinType
from koheesio.spark.transformations.uuid5 import HashUUID5
from koheesio.spark.transformations.lookup import DataframeLookup, JoinMapping, TargetColumn, JoinType

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()
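
For reference, a runnable sketch of the `ColumnsTransformationWithTarget` pattern from the docs above, using the new `koheesio.spark.transformations` import path. The class, column names, and the `transform()` call are illustrative:

```python
from pyspark.sql import Column, SparkSession

from koheesio.spark.transformations import ColumnsTransformationWithTarget


class AddOneWithTarget(ColumnsTransformationWithTarget):
    """Add 1 to the selected column and store the result in the target column."""

    def func(self, col: Column) -> Column:
        return col + 1


spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,)], ["value"])

# Read from "value", write the result to "value_plus_one".
output_df = AddOneWithTarget(column="value", target_column="value_plus_one").transform(df)
output_df.show()
```
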
6 changes: 3 additions & 3 deletions docs/tutorials/advanced-data-processing.md
@@ -36,8 +36,8 @@ Partitioning is a technique that divides your data into smaller, more manageable
allows you to specify the partitioning scheme for your data when writing it to a target.

```python
from koheesio.steps.writers.delta import DeltaTableWriter
from koheesio.tasks.etl_task import EtlTask
from koheesio.spark.writers.delta import DeltaTableWriter
from koheesio.spark.etl_task import EtlTask

class MyTask(EtlTask):
target = DeltaTableWriter(table="my_table", partitionBy=["column1", "column2"])
@@ -52,7 +52,7 @@ class MyTask(EtlTask):
[//]: # ()
[//]: # (```python)

[//]: # (from koheesio.steps.transformations.cache import CacheTransformation)
[//]: # (from koheesio.spark.transformations.cache import CacheTransformation)

[//]: # ()
[//]: # (class MyTask(EtlTask):)
17 changes: 0 additions & 17 deletions docs/tutorials/getting-started.md
@@ -19,23 +19,6 @@
```
</details>

<details>
<summary>poetry</summary>

If you're using Poetry, add the following entry to the `pyproject.toml` file:

```toml title="pyproject.toml"
[[tool.poetry.source]]
name = "nike"
url = "https://artifactory.nike.com/artifactory/api/pypi/python-virtual/simple"
secondary = true
```

```bash
poetry add koheesio
```
</details>

<details>
<summary>pip</summary>

38 changes: 28 additions & 10 deletions docs/tutorials/hello-world.md
@@ -1,5 +1,23 @@
# Simple Examples

## Bring your own SparkSession

The Koheesio Spark module does not set up a SparkSession for you. You need to create a SparkSession before using
Koheesio Spark classes. This is the entry point for any Spark functionality, allowing the step to interact with the
Spark cluster.

- Every `SparkStep` has a `spark` attribute, which is the active SparkSession.
- Koheesio supports both local and remote (connect) Spark Sessions
- The SparkSession you created can be explicitly passed to the `SparkStep` constructor (this is optional)

To create a simple SparkSession, you can use the following code:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
```

## Creating a Custom Step

This example demonstrates how to use the `SparkStep` class from the `koheesio` library to create a custom step named
@@ -8,7 +26,7 @@ This example demonstrates how to use the `SparkStep` class from the `koheesio` l
### Code

```python
from koheesio.steps.step import SparkStep
from koheesio.spark import SparkStep

class HelloWorldStep(SparkStep):
message: str
@@ -21,7 +39,7 @@ class HelloWorldStep(SparkStep):
### Usage

```python
hello_world_step = HelloWorldStep(message="Hello, World!")
hello_world_step = HelloWorldStep(message="Hello, World!", spark=spark) # optionally pass the spark session
hello_world_step.execute()

hello_world_step.output.df.show()
@@ -33,16 +51,15 @@ The `HelloWorldStep` class is a `SparkStep` in Koheesio, designed to generate a

- `HelloWorldStep` inherits from `SparkStep`, a fundamental building block in Koheesio for creating data processing steps with Apache Spark.
- It has a `message` attribute. When creating an instance of `HelloWorldStep`, you can pass a custom message that will be used in the DataFrame.
- `SparkStep` has a `spark` attribute, which is the active SparkSession. This is the entry point for any Spark functionality, allowing the step to interact with the Spark cluster.
- `SparkStep` also includes an `Output` class, used to store the output of the step. In this case, `Output` has a `df` attribute to store the output DataFrame.
- The `execute` method creates a DataFrame with the custom message and stores it in `output.df`. It doesn't return a value explicitly; instead, the output DataFrame can be accessed via `output.df`.
- Koheesio uses pydantic for automatic validation of the step's input and output, ensuring they are correctly defined and of the correct types.
- The `spark` attribute can be optionally passed to the constructor when creating an instance of `HelloWorldStep`. This allows you to use an existing SparkSession or create a new one specifically for the step.
- If no `SparkSession` is passed to a `SparkStep`, Koheesio calls `SparkSession.getActiveSession()` to retrieve an active session. If no active session is found, the step will fail.

Note: Pydantic is a data validation library that provides a way to validate that the data (in this case, the input and output of the step) conforms to the expected format.


---

## Creating a Custom Task

This example demonstrates how to use the `EtlTask` from the `koheesio` library to create a custom task named `MyFavoriteMovieTask`.
@@ -51,9 +68,10 @@ This example demonstrates how to use the `EtlTask` from the `koheesio` library t

```python
from typing import Any
from pyspark.sql import DataFrame, functions as f
from koheesio.steps.transformations import Transform
from koheesio.tasks.etl_task import EtlTask
from pyspark.sql import functions as f
from koheesio.spark import DataFrame
from koheesio.spark.transformations.transform import Transform
from koheesio.spark.etl_task import EtlTask


def add_column(df: DataFrame, target_column: str, value: Any):
@@ -104,8 +122,8 @@ source:
```python
from pyspark.sql import SparkSession
from koheesio.context import Context
from koheesio.steps.readers import DummyReader
from koheesio.steps.writers.dummy import DummyWriter
from koheesio.spark.readers.dummy import DummyReader
from koheesio.spark.writers.dummy import DummyWriter

context = Context.from_yaml("sample.yaml")

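
To tie the tutorial pieces above together, a compact end-to-end sketch. The `DummyReader` range, the greeting column, and the `Transform` keyword arguments are illustrative:

```python
from pyspark.sql import SparkSession, functions as f

from koheesio.spark.etl_task import EtlTask
from koheesio.spark.readers.dummy import DummyReader
from koheesio.spark.transformations.transform import Transform
from koheesio.spark.writers.dummy import DummyWriter

spark = SparkSession.builder.getOrCreate()


def add_greeting(df, target_column: str, value: str):
    """Append a constant greeting column to the DataFrame."""
    return df.withColumn(target_column, f.lit(value))


task = EtlTask(
    source=DummyReader(range=3),   # tiny in-memory source
    target=DummyWriter(),          # stand-in writer used in the examples above
    transformations=[Transform(add_greeting, target_column="greeting", value="Hello, World!")],
)
task.execute()
```
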
32 changes: 17 additions & 15 deletions docs/tutorials/testing-koheesio-steps.md
@@ -9,13 +9,15 @@ Unit testing involves testing individual components of the software in isolation
Here's an example of how to unit test a Koheesio task:

```python
from koheesio.tasks.etl_task import EtlTask
from koheesio.steps.readers import DummyReader
from koheesio.steps.writers.dummy import DummyWriter
from koheesio.steps.transformations import Transform
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from koheesio.spark import DataFrame
from koheesio.spark.etl_task import EtlTask
from koheesio.spark.readers.dummy import DummyReader
from koheesio.spark.writers.dummy import DummyWriter
from koheesio.spark.transformations.transform import Transform


def filter_age(df: DataFrame) -> DataFrame:
return df.filter(col("Age") > 18)
@@ -62,12 +64,12 @@ Here's an example of how to write an integration test for this task:

```python
# my_module.py
from koheesio.tasks.etl_task import EtlTask
from koheesio.spark.readers.delta import DeltaReader
from koheesio.steps.writers.delta import DeltaWriter
from koheesio.steps.transformations import Transform
from koheesio.context import Context
from pyspark.sql.functions import col
from koheesio.spark.etl_task import EtlTask
from koheesio.spark.readers.delta import DeltaTableReader
from koheesio.spark.writers.delta import DeltaTableWriter
from koheesio.spark.transformations.transform import Transform
from koheesio.context import Context


def filter_age(df):
@@ -84,8 +86,8 @@ context = Context({
})

task = EtlTask(
source=DeltaReader(**context.reader_options),
target=DeltaWriter(**context.writer_options),
source=DeltaTableReader(**context.reader_options),
target=DeltaTableWriter(**context.writer_options),
transformations=[
Transform(filter_age)
]
@@ -97,11 +99,11 @@ Now, let's create a test for this task. We'll use pytest and unittest.mock to mo
```python
# test_my_module.py
import pytest
from unittest.mock import MagicMock, patch
from unittest.mock import patch
from pyspark.sql import SparkSession
from koheesio.context import Context
from koheesio.steps.readers import Reader
from koheesio.steps.writers import Writer
from koheesio.spark.readers import Reader
from koheesio.spark.writers import Writer

from my_module import task

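
As a complement to the mocking approach above, a small pytest sketch for the `filter_age` helper; the fixture and test names are illustrative, and `my_module` refers to the example file above:

```python
# test_filter_age.py: minimal pytest sketch for the filter_age helper
import pytest
from pyspark.sql import SparkSession

from my_module import filter_age  # defined in the integration-test example above


@pytest.fixture(scope="session")
def spark():
    # Small local session; sufficient for unit-style DataFrame tests.
    return SparkSession.builder.master("local[1]").appName("koheesio-tests").getOrCreate()


def test_filter_age_keeps_only_adults(spark):
    df = spark.createDataFrame([("Alice", 25), ("Bob", 15)], ["Name", "Age"])

    result = filter_age(df)

    assert result.count() == 1
    assert result.first()["Name"] == "Alice"
```
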
12 changes: 9 additions & 3 deletions makefile
@@ -40,6 +40,12 @@ hatch-install:
fi
init: hatch-install

.PHONY: sync ## hatch - Update dependencies if you changed project dependencies in pyproject.toml
.PHONY: update ## hatch - alias for sync (if you are used to poetry, this is similar to running `poetry update`)
sync:
@hatch run dev:uv sync --all-extras
update: sync

# Code Quality
.PHONY: black black-fmt ## code quality - Use black to (re)format the codebase
black-fmt:
@@ -105,16 +111,16 @@ coverage: cov
all-tests:
@echo "\033[1mRunning all tests:\033[0m\n\033[35m This will run the full test suite\033[0m"
@echo "\033[1;31mWARNING:\033[0;33m This may take upward of 20-30 minutes to complete!\033[0m"
@hatch test --no-header --no-summary
@hatch test --no-header
.PHONY: spark-tests ## testing - Run SPARK tests in ALL environments
spark-tests:
@echo "\033[1mRunning Spark tests:\033[0m\n\033[35m This will run the Spark test suite against all specified environments\033[0m"
@echo "\033[1;31mWARNING:\033[0;33m This may take upward of 20-30 minutes to complete!\033[0m"
@hatch test -m spark --no-header --no-summary
@hatch test -m spark --no-header
.PHONY: non-spark-tests ## testing - Run non-spark tests in ALL environments
non-spark-tests:
@echo "\033[1mRunning non-Spark tests:\033[0m\n\033[35m This will run the non-Spark test suite against all specified environments\033[0m"
@hatch test -m "not spark" --no-header --no-summary
@hatch test -m "not spark" --no-header

.PHONY: dev-test ## testing - Run pytest, with all tests in the dev environment
dev-test: