Skip to content

Commit

Permalink
Add a Json pipeline for PyGrain
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 698008868
  • Loading branch information
Conchylicultor authored and The kauldron Authors committed Nov 19, 2024
1 parent 307dbb4 commit 9a36180
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 0 deletions.
1 change: 1 addition & 0 deletions kauldron/data/py/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from kauldron.data.py.base import DataSourceBase
from kauldron.data.py.data_sources import DataSource
from kauldron.data.py.data_sources import Tfds
from kauldron.data.py.data_sources import Json
from kauldron.data.py.mixtures import Mix

# *****************************************************************************
Expand Down
36 changes: 36 additions & 0 deletions kauldron/data/py/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from collections.abc import Mapping
import dataclasses
import functools
import json
from typing import Any, Optional

from etils import epath
Expand Down Expand Up @@ -55,3 +56,38 @@ def data_source(self) -> grain.RandomAccessDataSource:
data_dir=self.data_dir,
decoders=self.decoders,
)


# Should this be part of Grain?
@dataclasses.dataclass(frozen=True)
class JsonDataSource(grain.RandomAccessDataSource):
  """Random-access data source backed by a single JSON file.

  The file must contain a top-level JSON list; each element of that list is
  one example. The file is read and parsed on first access to `data`, and the
  parsed result is cached on the instance afterwards.

  Attributes:
    path: Path of the JSON file, opened through `epath.Path`.
  """

  path: str

  @functools.cached_property
  def data(self) -> list[Any]:
    """Parsed list of examples, loaded from `path` on first access.

    Raises:
      ValueError: If the top-level JSON value is not a list.
    """
    examples = json.loads(epath.Path(self.path).read_text())
    # A JSON object would report a length but then fail on integer lookups
    # (its keys are always strings), so reject non-list content up-front with
    # an explicit message instead of a late `KeyError`.
    if not isinstance(examples, list):
      raise ValueError(
          f'Expected {self.path!r} to contain a JSON list of examples, got'
          f' {type(examples).__name__}.'
      )
    return examples

  def __len__(self) -> int:
    """Returns the number of examples in the file."""
    return len(self.data)

  def __getitem__(self, record_key):
    """Returns the example stored at position `record_key`."""
    return self.data[record_key]


@dataclasses.dataclass(frozen=True)
class Json(base.DataSourceBase):
  """Pipeline reading its examples from a JSON file.

  The JSON file is expected to hold a list of examples.
  """

  path: str

  @functools.cached_property
  def data_source(self) -> grain.RandomAccessDataSource:
    """Data source wrapping the JSON file at `path` (built once, then cached)."""
    source = JsonDataSource(path=self.path)
    return source

0 comments on commit 9a36180

Please sign in to comment.