[Alpha pipeline] Write to hub component (#140)
PR that adds the write to hub component. 

Right now we're running the `to_parquet` method twice: once within the
component itself to write to the hub, and another time in fondant to
write the dataset. We should work on implementing a `DataWriter` class
(huggingface/huggingface_hub#138).
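
To make the intent concrete, a `DataWriter` along these lines could own the single parquet write, so components only hand their dataframe back to fondant. This is just a sketch of the idea; the class and method names are assumptions, not fondant API:

```python
import dask.dataframe as dd


class DataWriter:
    """Sketch of a writer fondant could invoke once per run, removing the
    duplicated `to_parquet` call. All names here are assumptions."""

    def __init__(self, path: str, schema: dict):
        self.path = path
        self.schema = schema

    def write_dataframe(self, dataframe: dd.DataFrame) -> None:
        # Single parquet write, shared by every sink (local, GCS, hub, ...)
        dd.to_parquet(dataframe, path=self.path, schema=self.schema)
```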

Another thing we still need to figure out is how to properly pass secrets
(huggingface/huggingface_hub#139). Currently the HF token is passed as an
argument that is visible in Kubeflow to all users who have access to the pipeline.
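
One possible direction (a sketch, not what this PR implements): mount the token as a Kubernetes secret through the kfp SDK instead of passing it as a plain pipeline argument. The secret name, key, env var, and the `write_to_hub_task` variable below are all assumptions:

```python
from kfp import onprem

# Assumes a Kubernetes secret "hf-credentials" with a "token" key exists in
# the cluster; it gets exposed to the container as the HF_TOKEN env var
# instead of showing up as a plain argument in the Kubeflow UI.
# write_to_hub_task is assumed to be the compiled kfp ContainerOp.
write_to_hub_task = write_to_hub_task.apply(
    onprem.use_k8s_secret(
        secret_name="hf-credentials",
        k8s_secret_key_to_env={"token": "HF_TOKEN"},
    )
)
```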

Also linking the issue about properly representing submitted images on the
hub (right now they are represented as byte strings):
huggingface/datasets#5869
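
For reference, once the parquet files are on the hub, the byte-string column can be cast to the `datasets` `Image` feature so the viewer renders actual images. The repo id and split are assumptions based on the example pipeline arguments:

```python
from datasets import Image, load_dataset

# Assumed repo id ("{username}/{dataset_name}" from the example pipeline args).
dataset = load_dataset("philippemo/test", split="train")

# Cast the raw bytes column to the Image feature so it decodes to PIL images.
dataset = dataset.cast_column("images_data", Image())
```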
PhilippeMoussalli authored May 17, 2023
1 parent 441c015 commit 2d393ac
Showing 7 changed files with 124 additions and 4 deletions.
@@ -19,4 +19,4 @@ output_subsets:
 args:
   dataset_name:
     description: Name of dataset on the hub
-    type: str
+    type: str
examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile
@@ -0,0 +1,20 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files and spec of the component
COPY src/ .
COPY fondant_component.yaml ../


ENTRYPOINT ["python", "main.py"]
Empty file.
examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml
@@ -0,0 +1,25 @@
name: Write to hub
description: Component that writes a dataset to the hub
image: ghcr.io/ml6team/write_to_hub:latest

input_subsets:
  images:
    fields:
      data:
        type: binary

  captions:
    fields:
      data:
        type: utf8

args:
  hf_token:
    description: The hugging face token used to write to the hub
    type: str
  username:
    description: The username under which to upload the dataset
    type: str
  dataset_name:
    description: The name of the dataset to upload
    type: str
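
A note on how this spec maps to dataframe columns: each subset field becomes a `{subset}_{field}` column, which is what the loop in main.py below relies on. A minimal illustration:

```python
# Mirrors the subsets declared in the spec above (illustration only).
subsets = {"images": ["data"], "captions": ["data"]}

write_columns = [
    f"{subset}_{field}" for subset, fields in subsets.items() for field in fields
]
print(write_columns)  # ['images_data', 'captions_data']
```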
examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt
@@ -0,0 +1,5 @@
huggingface_hub==0.14.1
git+https://github.com/ml6team/fondant.git
pyarrow>=7.0
Pillow==9.4.0
gcsfs==2023.4.0
examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py
@@ -0,0 +1,65 @@
"""
This component writes an image dataset to the hub.
"""
import logging

import huggingface_hub
import dask.dataframe as dd

from fondant.component import FondantTransformComponent
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)


class WriteToHubComponent(FondantTransformComponent):
def transform(
self,
dataframe: dd.DataFrame,
*,
hf_token: str,
username: str,
dataset_name: str,
) -> dd.DataFrame:
"""
Args:
dataframe: Dask dataframe
hf_token: The hugging face token used to write to the hub
username: The username under which to upload the dataset
dataset_name: The name of the dataset to upload
Returns:
dataset
"""
# login
huggingface_hub.login(token=hf_token)

# Create HF dataset repository
repo_id = f"{username}/{dataset_name}"
repo_path = f"hf://datasets/{repo_id}"
logger.info(f"Creating HF dataset repository under ID: '{repo_id}'")
huggingface_hub.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

# Get columns to write and schema
write_columns = []
schema = {}
for subset_name, subset in self.spec.input_subsets.items():
write_columns.extend([f"{subset_name}_{field}" for field in subset.fields])
# Get schema
subset_schema = {
f"{subset_name}_{field.name}": field.type.value
for field in subset.fields.values()
}

schema.update(subset_schema)

dataframe_hub = dataframe[write_columns]
dd.to_parquet(dataframe_hub, path=f"{repo_path}/data", schema=schema)

return dataframe


if __name__ == "__main__":
component = WriteToHubComponent.from_file()
component.run()
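
After a run, the parquet files written under `data/` in the repo should load straight from the hub. A quick usage sketch, with the repo id assumed from the example arguments in pipeline.py below:

```python
from datasets import load_dataset

# Assumed repo id; parquet files live under data/ in the dataset repo.
dataset = load_dataset("philippemo/test", split="train")
print(dataset.features)  # expect images_data (binary) and captions_data (string)
```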
11 changes: 8 additions & 3 deletions examples/pipelines/simple_pipeline/pipeline.py
@@ -31,9 +31,14 @@
     },
 )

-# TODO: ADD Arguments for embedding component later on
-# MODEL_ID = "openai/clip-vit-large-patch14"
-# BATCH_SIZE = 10
+write_to_hub_op = FondantComponentOp(
+    component_spec_path="components/write_to_hub/fondant_component.yaml",
+    arguments={
+        "username": "philippemo",
+        "dataset_name": "test",
+        "hf_token": "",
+    },
+)

 pipeline = Pipeline(pipeline_name=pipeline_name, base_path=PipelineConfigs.BASE_PATH)
