Skip to content

Commit

Permalink
feat: add Open Prices export
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Nov 19, 2024
1 parent 095e0b1 commit 30c83a4
Show file tree
Hide file tree
Showing 7 changed files with 444 additions and 46 deletions.
43 changes: 4 additions & 39 deletions openfoodfacts_exports/exports/parquet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,14 @@
import pyarrow as pa
import pyarrow.parquet as pq
import tqdm
from huggingface_hub import HfApi
from more_itertools import chunked
from openfoodfacts import Flavor
from openfoodfacts.utils import jsonl_iter

from openfoodfacts_exports import settings

from .beauty import BEAUTY_DTYPE_MAP, BEAUTY_PRODUCT_SCHEMA, BeautyProduct
from .common import Product
from .common import Product, push_parquet_file_to_hf
from .food import FOOD_DTYPE_MAP, FOOD_PRODUCT_SCHEMA, FoodProduct

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -67,7 +66,9 @@ def export_parquet(
shutil.move(tmp_converted_parquet_path, output_path)

if settings.ENABLE_HF_PUSH:
push_parquet_file_to_hf(data_path=output_path)
push_parquet_file_to_hf(
data_path=output_path, repo_id="openfoodfacts/product-database"
)
else:
logger.info("Hugging Face push is disabled.")
logger.info("JSONL to Parquet conversion and postprocessing completed.")
Expand Down Expand Up @@ -127,39 +128,3 @@ def convert_jsonl_to_parquet(

if writer is not None:
writer.close()


def push_parquet_file_to_hf(
    data_path: Path,
    repo_id: str = "openfoodfacts/product-database",
    revision: str = "main",
    commit_message: str = "Database updated",
) -> None:
    """Push a Parquet file to Hugging Face Hub.

    The file is uploaded to a dataset repository (``repo_type="dataset"``)
    under its own file name.

    Args:
        data_path (Path): The path to the Parquet file to push. The name of the
            file will be used as the path in the repository.
        repo_id (str, optional): The repository ID on Hugging Face Hub.
            Defaults to "openfoodfacts/product-database".
        revision (str, optional): The revision to push the data to. Defaults to
            "main".
        commit_message (str, optional): The commit message. Defaults to
            "Database updated".

    Raises:
        FileNotFoundError: If `data_path` does not exist.
        ValueError: If `data_path` does not have the ".parquet" suffix.
    """
    logger.info("Start pushing data to Hugging Face at %s", repo_id)

    # Validate the input before any network call is attempted.
    if not data_path.exists():
        raise FileNotFoundError(f"Data is missing: {data_path}")
    if data_path.suffix != ".parquet":
        raise ValueError(f"A parquet file is expected. Got {data_path.suffix} instead.")

    # We use the HF_Hub api since it gives us way more flexibility than
    # push_to_hub()
    HfApi().upload_file(
        path_or_fileobj=data_path,
        repo_id=repo_id,
        revision=revision,
        repo_type="dataset",
        path_in_repo=data_path.name,
        commit_message=commit_message,
    )
    logger.info("Data successfully pushed to Hugging Face at %s", repo_id)
41 changes: 41 additions & 0 deletions openfoodfacts_exports/exports/parquet/common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import logging
from pathlib import Path

import pyarrow as pa
from huggingface_hub import HfApi
from pydantic import BaseModel, Field, model_validator

logger = logging.getLogger(__name__)


class ImageSize(BaseModel):
h: int | None = None
Expand Down Expand Up @@ -353,3 +359,38 @@ def parse_owner_fields(cls, data: dict):
]
)
)


def push_parquet_file_to_hf(
    data_path: Path,
    repo_id: str,
    revision: str = "main",
    commit_message: str = "Database updated",
) -> None:
    """Push a Parquet file to Hugging Face Hub.

    The file is uploaded to a dataset repository (``repo_type="dataset"``)
    under its own file name.

    Args:
        data_path (Path): The path to the Parquet file to push. The name of the
            file will be used as the path in the repository.
        repo_id (str): The repository ID on Hugging Face Hub. Required: there
            is no default, so every caller names its target repository.
        revision (str, optional): The revision to push the data to. Defaults to
            "main".
        commit_message (str, optional): The commit message. Defaults to
            "Database updated".

    Raises:
        FileNotFoundError: If `data_path` does not exist.
        ValueError: If `data_path` does not have the ".parquet" suffix.
    """
    logger.info("Start pushing data to Hugging Face at %s", repo_id)

    # Validate the input before any network call is attempted.
    if not data_path.exists():
        raise FileNotFoundError(f"Data is missing: {data_path}")
    if data_path.suffix != ".parquet":
        raise ValueError(f"A parquet file is expected. Got {data_path.suffix} instead.")

    # We use the HF_Hub api since it gives us way more flexibility than
    # push_to_hub()
    HfApi().upload_file(
        path_or_fileobj=data_path,
        repo_id=repo_id,
        revision=revision,
        repo_type="dataset",
        path_in_repo=data_path.name,
        commit_message=commit_message,
    )
    logger.info("Data successfully pushed to Hugging Face at %s", repo_id)
Loading

0 comments on commit 30c83a4

Please sign in to comment.