From 3544a936fe515bb7a4a578baa75f2eae438517fa Mon Sep 17 00:00:00 2001
From: Danny Meijer <10511979+dannymeijer@users.noreply.github.com>
Date: Fri, 13 Dec 2024 18:28:01 +0100
Subject: [PATCH] refactor: improve type handling in
 DownloadFileFromUrlTransformation and enhance test type hints

---
 .../spark/transformations/download_files.py   |  2 +-
 .../transformations/test_download_files.py    | 42 ++++++++++++++-----
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/src/koheesio/spark/transformations/download_files.py b/src/koheesio/spark/transformations/download_files.py
index 296571c..3e134f2 100644
--- a/src/koheesio/spark/transformations/download_files.py
+++ b/src/koheesio/spark/transformations/download_files.py
@@ -146,7 +146,7 @@ def execute(self) -> Output:
         Download files from URLs in the specified column.
         """
         # Collect the URLs from the DataFrame and process them
-        source_column_name = get_column_name(self.column)  # type: ignore
+        source_column_name = self.column if isinstance(self.column, str) else get_column_name(self.column)  # type: ignore
         partition = {row.asDict()[source_column_name] for row in self.df.select(self.column).collect()}  # type: ignore
         self.func(partition)
 
diff --git a/tests/spark/transformations/test_download_files.py b/tests/spark/transformations/test_download_files.py
index 00f2c2d..7134533 100644
--- a/tests/spark/transformations/test_download_files.py
+++ b/tests/spark/transformations/test_download_files.py
@@ -1,10 +1,13 @@
+from pathlib import Path
+
 import pytest
 
-from koheesio.spark.transformations.download_files import DownloadFileFromUrlTransformation
+from koheesio.spark import DataFrame, SparkSession  # type: ignore
+from koheesio.spark.transformations.download_files import DownloadFileFromUrlTransformation  # type: ignore
 
 
 @pytest.fixture
-def input_df(spark):
+def input_df(spark: SparkSession) -> DataFrame:
     """A simple DataFrame containing two URLs."""
     return spark.createDataFrame(
         [
@@ -16,7 +19,7 @@ def input_df(spark):
 
 
 @pytest.fixture
-def download_path(tmp_path):
+def download_path(tmp_path: Path) -> Path:
     _path = tmp_path / "downloads"
     _path.mkdir(exist_ok=True)
     return _path
@@ -24,18 +27,41 @@ def download_path(tmp_path):
 
 class TestDownloadFileFromUrlTransformation:
     """
+    Input DataFrame:
+
     | key | url                                        |
     |-----|--------------------------------------------|
     | 101 |	http://www.textfiles.com/100/adventur.txt  |
     | 102 |	http://www.textfiles.com/100/arttext.fun   |
+
+    Output DataFrame:
+
+    | key | url                                        | downloaded_file_path  |
+    |-----|--------------------------------------------|-----------------------|
+    | 101 |	http://www.textfiles.com/100/adventur.txt  | downloads/adventur.txt|
+    | 102 |	http://www.textfiles.com/100/arttext.fun   | downloads/arttext.fun |
+
     """
 
-    def test_downloading_files(self, input_df, download_path):
+    def test_downloading_files(self, input_df: DataFrame, download_path: Path) -> None:
+        """Test that the files are downloaded and the DataFrame is transformed correctly."""
+        # Arrange
+        expected_data = [
+            "downloads/adventur.txt",
+            "downloads/arttext.fun",
+        ]
+
+        # Act
         transformed_df = DownloadFileFromUrlTransformation(
             column="url",
             download_path=download_path,
             target_column="downloaded_file_path",
         ).transform(input_df)
+        actual_data = sorted(
+            [row.asDict()["downloaded_file_path"] for row in transformed_df.select("downloaded_file_path").collect()]
+        )
+
+        # Assert
 
         # Check that adventur.txt and arttext.fun are actually downloaded
         assert (download_path / "adventur.txt").exists()
@@ -43,12 +69,6 @@ def test_downloading_files(self, input_df, download_path):
 
         assert transformed_df.count() == 2
         assert transformed_df.columns == ["key", "url", "downloaded_file_path"]
+
         # check that the rows of the output DataFrame are as expected
-        expected_data = [
-            "downloads/adventur.txt",
-            "downloads/arttext.fun",
-        ]
-        actual_data = sorted(
-            [row.asDict()["downloaded_file_path"] for row in transformed_df.select("downloaded_file_path").collect()]
-        )
         assert actual_data == expected_data