Skip to content

Commit

Permalink
final fixes
Browse files: browse the repository at this point in the history
  • Loading branch information
ahmeda14960 committed Oct 23, 2024
1 parent c971ebf commit 4733f3b
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 6 deletions.
2 changes: 1 addition & 1 deletion src/levanter/data/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,7 @@ def train_set(
monitors: Union[bool, List[MetricsMonitor]] = True,
*,
key: Optional[PRNGKeyArray] = None,
epochs: bool = False,
epochs: int = 0,
) -> AsyncDataset[np.ndarray]:

ds = self.token_seq_dataset("train", seq_len, monitors)
Expand Down
11 changes: 6 additions & 5 deletions src/levanter/main/train_lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,12 @@ def main(config: TrainLmConfig):
)


# add epoch logging
total_tokens_future = callbacks.get_total_dataset_tokens(train_dataset.dataset, config.model.seq_len)
trainer.add_hook(
callbacks.log_epoch_progress(total_tokens_future, Pos.size, trainer.config.train_batch_size), every=1
)
# add epoch logging if epochs specified
if config.epoch > 0:
total_tokens_future = callbacks.get_total_dataset_tokens(train_dataset.dataset, config.model.seq_len)
trainer.add_hook(
callbacks.log_epoch_progress(total_tokens_future, Pos.size, trainer.config.train_batch_size), every=1
)

# to do partitioning, our dimensions have to be divisible by the size of the physical axes they're mapped to
# For most things, we just insist you specify the config right, but tokenizers often have strange numbers of
Expand Down

0 comments on commit 4733f3b

Please sign in to comment.