Skip to content

Commit

Permalink
Set the default value of cache_dir to "cache/" in LMDatasetConfig and LMMixtureDatasetConfig so the old behavior is preserved
Browse files: browse the repository at this point in the history
  • Loading branch information
dlwh committed Oct 5, 2024
1 parent fd106fa commit f94cae4
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/levanter/data/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,8 @@ def tagged_eval_sets(
class LMDatasetConfig(LMDatasetSourceConfig, LMTaskConfig):
"""This class supports loading data both from HF Datasets and from a raw dataset of jsonl urls"""

cache_dir: Optional[str] = "cache/"

def train_set(
self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True, *, key: Optional[PRNGKeyArray] = None
) -> AsyncDataset[np.ndarray]:
Expand Down Expand Up @@ -705,6 +707,8 @@ def _convert_id_to_token(self, index: int) -> str:
class LMMixtureDatasetConfig(LMTaskConfig):
"""This class represents a mixture of datasets with their associated weights."""

cache_dir: Optional[str] = "cache/"

# data source configs and weights
configs: Dict[str, LMDatasetSourceConfig] = field(default_factory=dict)
""" configuration of each dataset source (urls, hf dataset id, etc.) """
Expand Down

0 comments on commit f94cae4

Please sign in to comment.