From f94cae424ee04d4ec87e1445fe061d105267c40f Mon Sep 17 00:00:00 2001 From: David Hall Date: Sat, 5 Oct 2024 00:21:59 -0500 Subject: [PATCH] set defaults to cache_dir so old behavior is preserved --- src/levanter/data/text.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 5e595b2a1..bcfcad397 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -577,6 +577,8 @@ def tagged_eval_sets( class LMDatasetConfig(LMDatasetSourceConfig, LMTaskConfig): """This class supports loading data both from HF Datasets and from a raw dataset of jsonl urls""" + cache_dir: Optional[str] = "cache/" + def train_set( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True, *, key: Optional[PRNGKeyArray] = None ) -> AsyncDataset[np.ndarray]: @@ -705,6 +707,8 @@ def _convert_id_to_token(self, index: int) -> str: class LMMixtureDatasetConfig(LMTaskConfig): """This class represents a mixture of datasets with their associated weights.""" + cache_dir: Optional[str] = "cache/" + # data source configs and weights configs: Dict[str, LMDatasetSourceConfig] = field(default_factory=dict) """ configuration of each dataset source (urls, hf dataset id, etc.) """