From 2d393ac7cc719538a8417e6f84544e194c4c80ee Mon Sep 17 00:00:00 2001
From: Philippe Moussalli
Date: Wed, 17 May 2023 11:35:32 +0200
Subject: [PATCH] [Alpha pipeline] Write to hub component (#140)

PR that adds the write to hub component.

Right now we're running the `to_parquet` method twice: once within the component itself to write to the hub, and a second time in fondant to write the dataset. We should work on implementing a `DataWriter` class huggingface/huggingface_hub#138 (a rough sketch of the idea is appended after the diff).

Another thing we still need to figure out is how to properly pass secrets huggingface/huggingface_hub#139. Currently the hf token is passed as an argument that is visible in Kubeflow to all users who have access to the pipeline (a possible workaround is also sketched after the diff).

Also linking the issue about properly representing submitted images on the hub (right now they are represented as byte strings): https://github.com/huggingface/datasets/issues/5869
---
 .../load_from_hub/fondant_component.yaml      |  2 +-
 .../components/write_to_hub/Dockerfile        | 20 ++++++
 .../components/write_to_hub/__init__.py       |  0
 .../write_to_hub/fondant_component.yaml       | 25 +++++++
 .../components/write_to_hub/requirements.txt  |  5 ++
 .../components/write_to_hub/src/main.py       | 65 +++++++++++++++++++
 .../pipelines/simple_pipeline/pipeline.py     | 11 +++-
 7 files changed, 124 insertions(+), 4 deletions(-)
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/__init__.py
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py

diff --git a/examples/pipelines/simple_pipeline/components/load_from_hub/fondant_component.yaml b/examples/pipelines/simple_pipeline/components/load_from_hub/fondant_component.yaml
index 2860d8cd0..a31e8333a 100644
--- a/examples/pipelines/simple_pipeline/components/load_from_hub/fondant_component.yaml
+++ b/examples/pipelines/simple_pipeline/components/load_from_hub/fondant_component.yaml
@@ -19,4 +19,4 @@ output_subsets:
 args:
   dataset_name:
     description: Name of dataset on the hub
-    type: str
\ No newline at end of file
+    type: str
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile b/examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile
new file mode 100644
index 000000000..532ce5bb7
--- /dev/null
+++ b/examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile
@@ -0,0 +1,20 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+## System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files and spec of the component
+COPY src/ .
+COPY fondant_component.yaml ../
+
+
+ENTRYPOINT ["python", "main.py"]
\ No newline at end of file
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/__init__.py b/examples/pipelines/simple_pipeline/components/write_to_hub/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml b/examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml
new file mode 100644
index 000000000..ac7e338ca
--- /dev/null
+++ b/examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml
@@ -0,0 +1,25 @@
+name: Write to hub
+description: Component that writes a dataset to the hub
+image: ghcr.io/ml6team/write_to_hub:latest
+
+input_subsets:
+  images:
+    fields:
+      data:
+        type: binary
+
+  captions:
+    fields:
+      data:
+        type: utf8
+
+args:
+  hf_token:
+    description: The Hugging Face token used to write to the hub
+    type: str
+  username:
+    description: The username under which to upload the dataset
+    type: str
+  dataset_name:
+    description: The name of the dataset to upload
+    type: str
\ No newline at end of file
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt b/examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt
new file mode 100644
index 000000000..a60057764
--- /dev/null
+++ b/examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt
@@ -0,0 +1,5 @@
+huggingface_hub==0.14.1
+git+https://github.com/ml6team/fondant.git
+pyarrow>=7.0
+Pillow==9.4.0
+gcsfs==2023.4.0
\ No newline at end of file
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py b/examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py
new file mode 100644
index 000000000..5ca1b487a
--- /dev/null
+++ b/examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py
@@ -0,0 +1,65 @@
+"""
+This component writes an image dataset to the hub.
+"""
+import logging
+
+import huggingface_hub
+import dask.dataframe as dd
+
+from fondant.component import FondantTransformComponent
+from fondant.logger import configure_logging
+
+configure_logging()
+logger = logging.getLogger(__name__)
+
+
+class WriteToHubComponent(FondantTransformComponent):
+    def transform(
+        self,
+        dataframe: dd.DataFrame,
+        *,
+        hf_token: str,
+        username: str,
+        dataset_name: str,
+    ) -> dd.DataFrame:
+        """
+        Args:
+            dataframe: Dask dataframe
+            hf_token: The Hugging Face token used to write to the hub
+            username: The username under which to upload the dataset
+            dataset_name: The name of the dataset to upload
+
+        Returns:
+            The input dataframe, unchanged
+        """
+        # Log in to the Hugging Face hub
+        huggingface_hub.login(token=hf_token)
+
+        # Create HF dataset repository
+        repo_id = f"{username}/{dataset_name}"
+        repo_path = f"hf://datasets/{repo_id}"
+        logger.info(f"Creating HF dataset repository under ID: '{repo_id}'")
+        huggingface_hub.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
+
+        # Collect the columns to write and build the parquet schema
+        write_columns = []
+        schema = {}
+        for subset_name, subset in self.spec.input_subsets.items():
+            write_columns.extend([f"{subset_name}_{field}" for field in subset.fields])
+            # Map each column name to its pyarrow type
+            subset_schema = {
+                f"{subset_name}_{field.name}": field.type.value
+                for field in subset.fields.values()
+            }
+
+            schema.update(subset_schema)
+
+        dataframe_hub = dataframe[write_columns]
+        dd.to_parquet(dataframe_hub, path=f"{repo_path}/data", schema=schema)
+
+        return dataframe
+
+
+if __name__ == "__main__":
+    component = WriteToHubComponent.from_file()
+    component.run()
diff --git a/examples/pipelines/simple_pipeline/pipeline.py b/examples/pipelines/simple_pipeline/pipeline.py
index 457f2c2de..ae9bfe312 100644
--- a/examples/pipelines/simple_pipeline/pipeline.py
+++ b/examples/pipelines/simple_pipeline/pipeline.py
@@ -31,9 +31,14 @@
     },
 )
 
-# TODO: ADD Arguments for embedding component later on
-# MODEL_ID = "openai/clip-vit-large-patch14"
-# BATCH_SIZE = 10
+write_to_hub_op = FondantComponentOp(
+    component_spec_path="components/write_to_hub/fondant_component.yaml",
+    arguments={
+        "username": "philippemo",
+        "dataset_name": "test",
+        "hf_token": "",
+    },
+)
 
 pipeline = Pipeline(pipeline_name=pipeline_name, base_path=PipelineConfigs.BASE_PATH)
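
Rough sketch of the `DataWriter` idea referenced above (huggingface/huggingface_hub#138). This is a hypothetical illustration only: fondant does not expose a `DataWriter` class yet, and the class and method names below are made up. The point is just that the single `to_parquet` call would be owned by one place, instead of the component and fondant each writing a copy:

import dask.dataframe as dd


class DataWriter:
    """Hypothetical sketch: owns the one and only to_parquet call."""

    def __init__(self, path: str, schema: dict):
        self.path = path      # target location, e.g. hf://datasets/<repo_id>/data
        self.schema = schema  # mapping of column name -> pyarrow type

    def write(self, dataframe: dd.DataFrame) -> None:
        # One write shared by the hub upload and the pipeline output,
        # replacing the duplicated to_parquet calls described above.
        dd.to_parquet(dataframe, path=self.path, schema=self.schema)

With something along these lines, `WriteToHubComponent.transform` would only select columns and build the schema, and the framework would invoke the writer once.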
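Sketch of a possible workaround for the secrets issue (huggingface/huggingface_hub#139), assuming the component's container can get the token from an environment variable backed by a Kubernetes secret rather than a plain pipeline argument; `HF_TOKEN` is just an illustrative variable name:

import os

import huggingface_hub

# Assumption: the orchestrator injects the token via a mounted secret /
# environment variable instead of a Kubeflow argument visible in the UI.
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None:
    raise RuntimeError("HF_TOKEN is not set; mount it as a secret on the pod")

# huggingface_hub.login stores the token for subsequent hub calls.
huggingface_hub.login(token=hf_token)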