From 2d393ac7cc719538a8417e6f84544e194c4c80ee Mon Sep 17 00:00:00 2001
From: Philippe Moussalli
Date: Wed, 17 May 2023 11:35:32 +0200
Subject: [PATCH] [Alpha pipeline] Write to hub component (#140)

PR that adds the write to hub component.

Right now we're running the `to_parquet` method twice: once within the component itself to write to the hub, and a second time in fondant to write the dataset. We should work on implementing a `DataWriter` class huggingface/huggingface_hub#138 (a rough sketch of the idea is appended after the diff).

Another thing we still need to figure out is how to properly pass secrets huggingface/huggingface_hub#139. Currently the hf token is passed as an argument that is visible in Kubeflow to all users who have access to the pipeline (a possible workaround is also sketched after the diff).

Also linking the issue about properly representing submitted images on the hub (right now they are represented as byte strings): https://github.com/huggingface/datasets/issues/5869
---
 .../load_from_hub/fondant_component.yaml      |  2 +-
 .../components/write_to_hub/Dockerfile        | 20 ++++++
 .../components/write_to_hub/__init__.py       |  0
 .../write_to_hub/fondant_component.yaml       | 25 +++++++
 .../components/write_to_hub/requirements.txt  |  5 ++
 .../components/write_to_hub/src/main.py       | 65 +++++++++++++++++++
 .../pipelines/simple_pipeline/pipeline.py     | 11 +++-
 7 files changed, 124 insertions(+), 4 deletions(-)
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/__init__.py
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt
 create mode 100644 examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py

diff --git a/examples/pipelines/simple_pipeline/components/load_from_hub/fondant_component.yaml b/examples/pipelines/simple_pipeline/components/load_from_hub/fondant_component.yaml
index 2860d8cd0..a31e8333a 100644
--- a/examples/pipelines/simple_pipeline/components/load_from_hub/fondant_component.yaml
+++ b/examples/pipelines/simple_pipeline/components/load_from_hub/fondant_component.yaml
@@ -19,4 +19,4 @@ output_subsets:
 args:
   dataset_name:
     description: Name of dataset on the hub
-    type: str
\ No newline at end of file
+    type: str
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile b/examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile
new file mode 100644
index 000000000..532ce5bb7
--- /dev/null
+++ b/examples/pipelines/simple_pipeline/components/write_to_hub/Dockerfile
@@ -0,0 +1,20 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+## System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files and spec of the component
+COPY src/ .
+COPY fondant_component.yaml ../
+
+
+ENTRYPOINT ["python", "main.py"]
\ No newline at end of file
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/__init__.py b/examples/pipelines/simple_pipeline/components/write_to_hub/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml b/examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml
new file mode 100644
index 000000000..ac7e338ca
--- /dev/null
+++ b/examples/pipelines/simple_pipeline/components/write_to_hub/fondant_component.yaml
@@ -0,0 +1,25 @@
+name: Write to hub
+description: Component that writes a dataset to the hub
+image: ghcr.io/ml6team/write_to_hub:latest
+
+input_subsets:
+  images:
+    fields:
+      data:
+        type: binary
+
+  captions:
+    fields:
+      data:
+        type: utf8
+
+args:
+  hf_token:
+    description: The Hugging Face token used to write to the hub
+    type: str
+  username:
+    description: The username under which to upload the dataset
+    type: str
+  dataset_name:
+    description: The name of the dataset to upload
+    type: str
\ No newline at end of file
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt b/examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt
new file mode 100644
index 000000000..a60057764
--- /dev/null
+++ b/examples/pipelines/simple_pipeline/components/write_to_hub/requirements.txt
@@ -0,0 +1,5 @@
+huggingface_hub==0.14.1
+git+https://github.com/ml6team/fondant.git
+pyarrow>=7.0
+Pillow==9.4.0
+gcsfs==2023.4.0
\ No newline at end of file
diff --git a/examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py b/examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py
new file mode 100644
index 000000000..5ca1b487a
--- /dev/null
+++ b/examples/pipelines/simple_pipeline/components/write_to_hub/src/main.py
@@ -0,0 +1,65 @@
+"""
+This component writes an image dataset to the hub.
+"""
+import logging
+
+import huggingface_hub
+import dask.dataframe as dd
+
+from fondant.component import FondantTransformComponent
+from fondant.logger import configure_logging
+
+configure_logging()
+logger = logging.getLogger(__name__)
+
+
+class WriteToHubComponent(FondantTransformComponent):
+    def transform(
+        self,
+        dataframe: dd.DataFrame,
+        *,
+        hf_token: str,
+        username: str,
+        dataset_name: str,
+    ) -> dd.DataFrame:
+        """
+        Args:
+            dataframe: Dask dataframe
+            hf_token: The Hugging Face token used to write to the hub
+            username: The username under which to upload the dataset
+            dataset_name: The name of the dataset to upload
+
+        Returns:
+            The input dataframe, unchanged
+        """
+        # Log in to the Hugging Face hub
+        huggingface_hub.login(token=hf_token)
+
+        # Create HF dataset repository
+        repo_id = f"{username}/{dataset_name}"
+        repo_path = f"hf://datasets/{repo_id}"
+        logger.info(f"Creating HF dataset repository under ID: '{repo_id}'")
+        huggingface_hub.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
+
+        # Collect the columns to write and build the parquet schema
+        write_columns = []
+        schema = {}
+        for subset_name, subset in self.spec.input_subsets.items():
+            write_columns.extend([f"{subset_name}_{field}" for field in subset.fields])
+            # Map each column name to its pyarrow type
+            subset_schema = {
+                f"{subset_name}_{field.name}": field.type.value
+                for field in subset.fields.values()
+            }
+
+            schema.update(subset_schema)
+
+        dataframe_hub = dataframe[write_columns]
+        dd.to_parquet(dataframe_hub, path=f"{repo_path}/data", schema=schema)
+
+        return dataframe
+
+
+if __name__ == "__main__":
+    component = WriteToHubComponent.from_file()
+    component.run()
diff --git a/examples/pipelines/simple_pipeline/pipeline.py b/examples/pipelines/simple_pipeline/pipeline.py
index 457f2c2de..ae9bfe312 100644
--- a/examples/pipelines/simple_pipeline/pipeline.py
+++ b/examples/pipelines/simple_pipeline/pipeline.py
@@ -31,9 +31,14 @@
     },
 )
 
-# TODO: ADD Arguments for embedding component later on
-# MODEL_ID = "openai/clip-vit-large-patch14"
-# BATCH_SIZE = 10
+write_to_hub_op = FondantComponentOp(
+    component_spec_path="components/write_to_hub/fondant_component.yaml",
+    arguments={
+        "username": "philippemo",
+        "dataset_name": "test",
+        "hf_token": "",
+    },
+)
 
 pipeline = Pipeline(pipeline_name=pipeline_name, base_path=PipelineConfigs.BASE_PATH)
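
Rough sketch of the `DataWriter` idea referenced above (huggingface/huggingface_hub#138). This is a hypothetical illustration only: fondant does not expose a `DataWriter` class yet, and the class and method names below are made up. The point is just that the single `to_parquet` call would be owned by one place, instead of the component and fondant each writing a copy:

import dask.dataframe as dd


class DataWriter:
    """Hypothetical sketch: owns the one and only to_parquet call."""

    def __init__(self, path: str, schema: dict):
        self.path = path      # target location, e.g. hf://datasets/<repo_id>/data
        self.schema = schema  # mapping of column name -> pyarrow type

    def write(self, dataframe: dd.DataFrame) -> None:
        # One write shared by the hub upload and the pipeline output,
        # replacing the duplicated to_parquet calls described above.
        dd.to_parquet(dataframe, path=self.path, schema=self.schema)

With something along these lines, `WriteToHubComponent.transform` would only select columns and build the schema, and the framework would invoke the writer once.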
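Sketch of a possible workaround for the secrets issue (huggingface/huggingface_hub#139), assuming the component's container can get the token from an environment variable backed by a Kubernetes secret rather than a plain pipeline argument; `HF_TOKEN` is just an illustrative variable name:

import os

import huggingface_hub

# Assumption: the orchestrator injects the token via a mounted secret /
# environment variable instead of a Kubeflow argument visible in the UI.
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None:
    raise RuntimeError("HF_TOKEN is not set; mount it as a secret on the pod")

# huggingface_hub.login stores the token for subsequent hub calls.
huggingface_hub.login(token=hf_token)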