Commit

adds revision in hf upload
guipenedo committed Jan 30, 2025
1 parent b105dcd commit b3daef2
Showing 2 changed files with 13 additions and 2 deletions.
src/datatrove/pipeline/writers/huggingface.py (6 changes: 5 additions & 1 deletion)
@@ -37,6 +37,7 @@ def __init__(
         expand_metadata: bool = True,
         max_file_size: int = round(4.5 * 2**30),  # 4.5GB, leave some room for the last batch
         schema: Any = None,
+        revision: str | None = None,
     ):
         """
         This class is intended to upload VERY LARGE datasets. Consider using `push_to_hub` or just using a
@@ -53,6 +54,7 @@ def __init__(
             expand_metadata: save each metadata entry in a different column instead of as a dictionary
             max_file_size: will create a new file when this size is exceeded (in bytes). -1 for no limit.
                 Filenames will have a number prepended (000_..., 001_..., etc)
+            revision: The git revision to commit from. Defaults to the head of the `"main"` branch
         """
         self.dataset = dataset
         self.private = private
@@ -78,6 +80,7 @@ def __init__(
         )
         self.operations = []
         self._repo_init = False
+        self.revision = revision

     def upload_files(self, *filenames):
         if not self._repo_init:
@@ -88,7 +91,7 @@ def upload_files(self, *filenames):
             for filename in filenames
         ]
         logger.info(f"Uploading {','.join(filenames)} to the hub...")
-        preupload_lfs_files(self.dataset, repo_type="dataset", additions=additions)
+        preupload_lfs_files(self.dataset, repo_type="dataset", additions=additions, revision=self.revision)
         logger.info(f"Upload of {','.join(filenames)} to the hub complete!")
         if self.cleanup:
             for filename in filenames:
@@ -109,6 +112,7 @@ def close(self, rank: int = 0):
                     repo_type="dataset",
                     operations=self.operations,
                     commit_message=f"DataTrove upload ({len(self.operations)} files)",
+                    revision=self.revision,
                 )
                 break
             except HfHubHTTPError as e:
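Taken together, the writer change threads a single `revision` value from the constructor through both the LFS pre-upload and the final `create_commit`, so shards can land on a branch or pull-request ref instead of `main`. Below is a minimal usage sketch, not taken from the repository: the repo id, local path and branch name are placeholders, and `local_working_dir` is assumed to keep its existing meaning in `HuggingFaceDatasetWriter`.

    # Hedged sketch: write shards to an existing branch instead of "main".
    from datatrove.pipeline.writers.huggingface import HuggingFaceDatasetWriter

    writer = HuggingFaceDatasetWriter(
        dataset="my-org/my-dataset",         # placeholder dataset repo id on the Hub
        local_working_dir="/tmp/hf_shards",  # placeholder staging dir for the shards (assumed argument)
        revision="staging",                  # new argument: a ref that must already exist on the Hub
    )
    # With this writer, preupload_lfs_files(...) and the closing create_commit(...)
    # both receive revision=self.revision, so every upload targets the "staging" ref.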
src/datatrove/utils/word_tokenizers.py (9 changes: 8 additions & 1 deletion)
@@ -134,7 +134,14 @@ def _do_tokenize(self, text: str):
         # japanese has a max byte length
         texts = [text] if self.language != "ja" else chunk_text_on_bytes(text, 40000)
         self.tokenizer.max_length = len(text)
-        return [self.tokenizer(t, disable=["parser", "tagger", "ner"]) for t in texts]
+        try:
+            return [self.tokenizer(t, disable=["parser", "tagger", "ner"]) for t in texts]
+        except KeyError as e:
+            # this dumb string breaks the tokenizer completely
+            if "IS_ALPHA" in str(e):
+                return [self.tokenizer(t.replace("IS_ALPHA", ""), disable=["parser", "tagger", "ner"]) for t in texts]
+            else:
+                raise e

     def word_tokenize(self, text: str) -> list[str]:
         # Make sure to do all the token processing inside the memory zone, as after that memory address to tokens
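The tokenizer change wraps the spaCy call in a retry: if tokenization raises a `KeyError` whose message mentions the literal string `IS_ALPHA`, the text is tokenized again with that substring stripped, and any other `KeyError` is re-raised. A standalone sketch of the same fallback pattern, assuming a blank spaCy English pipeline rather than datatrove's wrapper; the helper name and sample text are illustrative only.

    # Standalone sketch of the fallback above (not datatrove's actual class).
    import spacy

    nlp = spacy.blank("en")  # assumption: any pipeline where only the tokenizer is needed

    def tokenize_with_fallback(text: str) -> list[str]:
        try:
            doc = nlp(text)
        except KeyError as e:
            # mirror the fix: retry once with the offending substring removed,
            # re-raise anything that is not the IS_ALPHA case
            if "IS_ALPHA" in str(e):
                doc = nlp(text.replace("IS_ALPHA", ""))
            else:
                raise
        return [token.text for token in doc]

    print(tokenize_with_fallback("short example text containing IS_ALPHA"))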
