Skip to content

Commit

Permalink
deprecate LMSupervisedDatasetConfig, but keep it working for now (#806)
Browse files Browse the repository at this point in the history
  • Loading branch information
dlwh authored Nov 17, 2024
1 parent 3d5677e commit 558e5d6
Showing 1 changed file with 25 additions and 8 deletions.
33 changes: 25 additions & 8 deletions src/levanter/data/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import json
import logging
import os
import warnings
from dataclasses import dataclass
from functools import cached_property
from itertools import chain
Expand Down Expand Up @@ -668,8 +669,18 @@ def tagged_eval_sets(
CANONICAL_OUTPUT_FIELD = "response"


class SupervisedSourceConfigBase(Protocol):
    """Structural interface for supervised data source configs.

    Implementers expose the four attributes declared below and provide
    ``get_shard_source``.
    """

    # Attributes every conforming config must expose.
    input_field: str
    output_field: str
    tags: Optional[List[str]]
    cache_dir: str

    def get_shard_source(self, split: str) -> Optional[ShardedDataSource[dict]]:
        """Return the sharded data source for ``split``.

        The return type allows ``None``. This protocol stub always raises;
        concrete configs are expected to override it.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError


@dataclass
class LMSupervisedDatasetConfig:
class LMSupervisedDatasetConfig(SupervisedSourceConfigBase):
"""Config for supervised fine-tuning datasets"""

cache_dir: str = "cache/"
Expand All @@ -688,15 +699,21 @@ class LMSupervisedDatasetConfig:
# Optional metadata
tags: Optional[List[str]] = None

def __post_init__(self):
    """Emit a DeprecationWarning each time this config is instantiated.

    The config keeps working; the warning only directs users to the
    replacement config classes named in the message.
    """
    warnings.warn(
        "LMSupervisedDatasetConfig is deprecated. Use SupervisedHfSourceConfig or "
        "SupervisedUrlSourceConfig instead.",
        DeprecationWarning,
        # Frame 1 is this method and frame 2 is the dataclass-generated
        # __init__, so 3 attributes the warning to the code that constructed
        # the config rather than to this module (where default warning
        # filters would hide it).
        stacklevel=3,
    )

class SupervisedSourceConfigBase(Protocol):
def get_shard_source(self, split: str) -> Optional[ShardedDataSource[dict]]:
raise NotImplementedError

input_field: str
output_field: str
tags: Optional[List[str]]
cache_dir: str
if self.hf_dataset_name is not None:
return WrappedHFDataSource(self.hf_dataset_name, split=self.hf_dataset_split)
elif split != "validation":
raise ValueError("Only validation split is supported for local files")
else:
urls = [globbed for url in self.validation_urls for globbed in expand_glob(url)]
return JsonlDataSource(urls)


@dataclass(frozen=True)
Expand Down

0 comments on commit 558e5d6

Please sign in to comment.