Merge branch 'main' into 50-evals-research-notebook-sample
Showing 48 changed files with 1,197 additions and 641 deletions.
@@ -0,0 +1,78 @@
#!/usr/bin/env python3

import argparse

from datasets import Dataset
from transformers import AutoTokenizer

from delphi.dataset.tokenization import tokenize_dataset
from delphi.eval.utils import load_validation_dataset

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Tokenize a text dataset from Hugging Face and upload the result to the Hub."
    )

    parser.add_argument(
        "--input-dataset-name",
        type=str,
        help="Text dataset from Hugging Face to tokenize",
    )
    parser.add_argument(
        "--output-dataset-name",
        type=str,
        help="Name of the tokenized dataset to upload to Hugging Face",
    )
    parser.add_argument(
        "--tokenizer-name",
        type=str,
        help="Name of the tokenizer from Hugging Face",
    )
    parser.add_argument(
        "--token",
        type=str,
        help="Hugging Face API token",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=512,
        help="Context size of the tokenized dataset as input of the model",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=50,
        help="Batch size of text inputs into the tokenizer",
    )
    parser.add_argument(
        "--column-name",
        type=str,
        help="Name of the column containing text documents in the input dataset",
    )
    args = parser.parse_args()

    input_dataset = load_validation_dataset(f"delphi-suite/{args.input_dataset_name}")
    tokenizer = AutoTokenizer.from_pretrained(f"delphi-suite/{args.tokenizer_name}")

    if args.column_name:
        text_docs = input_dataset[args.column_name]
    else:
        if len(input_dataset.column_names) > 1:
            raise ValueError(
                "The specified dataset has more than one column; please pass --column-name"
            )
        text_docs = input_dataset[input_dataset.column_names[0]]

    output_dataset = Dataset.from_dict(
        {
            "tokens": tokenize_dataset(
                text_docs,
                tokenizer,
                context_size=args.context_size,
                batch_size=args.batch_size,
            )
        }
    )

    output_dataset.push_to_hub(
        repo_id=f"delphi-suite/{args.output_dataset_name}",
        private=False,
        token=args.token,
    )
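For a quick local look at what this script builds before it pushes anything to the Hub, here is a minimal sketch along the same lines. It assumes the delphi package from this repo is installed; the gpt2 tokenizer and the toy documents are placeholders chosen for illustration, not values used by this commit.

from datasets import Dataset
from transformers import AutoTokenizer

from delphi.dataset.tokenization import tokenize_dataset

# Toy documents stand in for the Hugging Face text dataset the script loads.
text_docs = [
    "Once upon a time there was a tiny robot.",
    "The robot liked to count pebbles by the river.",
]

# Any tokenizer that defines bos/eos token ids works; gpt2 is only a placeholder.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

tokens = tokenize_dataset(text_docs, tokenizer, context_size=8, batch_size=2)
local_dataset = Dataset.from_dict({"tokens": tokens})

print(local_dataset)               # Dataset with a single "tokens" column
print(local_dataset["tokens"][0])  # first fixed-length sample, starting with the BOS id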
@@ -0,0 +1,107 @@
from collections import deque
from typing import Optional

from transformers import PreTrainedTokenizerBase


def extend_deque(
    dq: deque[int],
    context_size: int,
    text_documents: list[str],
    doc_idx: int,
    tokenizer: PreTrainedTokenizerBase,
    batch_size: int,
) -> int:
    """
    Extends the deque with tokenized text documents until the deque grows large
    enough to reach the context size, or until all text documents are processed.

    A deque is used to save memory: documents are tokenized in small batches
    on demand instead of loading and tokenizing the whole dataset at once.

    Args:
        dq: Deque of token ids to extend.
        context_size: Size of the context (input sequences).
        text_documents: List of (untokenized) text documents to be tokenized.
        doc_idx: Index of the current text document.
        tokenizer: Tokenizer to encode the text strings.
        batch_size: Number of text documents passed to the tokenizer per call.

    Returns:
        int: Updated index in the text documents dataset.
    """
    while len(dq) < context_size and doc_idx < len(text_documents):
        text_doc = text_documents[doc_idx : doc_idx + batch_size]
        batch_input_ids = tokenizer(
            text_doc, return_attention_mask=False, add_special_tokens=False
        )["input_ids"]
        for input_ids in batch_input_ids:
            # Separate consecutive documents with the EOS token.
            dq.extend(input_ids + [tokenizer.eos_token_id])
        doc_idx += batch_size
    return doc_idx


def make_new_samples(
    dq: deque[int], context_size: int, bos_token_id: int
) -> list[list[int]]:
    """
    Generates new samples for training by creating sequences of tokens
    from the deque until the deque does not hold enough tokens to generate
    another sample.

    Note: the model is unable to use the last token in an input sequence,
    so we repeat this token at the start of the next input sequence.

    Args:
        dq: Deque containing token ids.
        context_size: Size of the context (input sequences).
        bos_token_id: bos_token_id of the tokenizer used.

    Returns:
        list[list[int]]: List of token sequences, each consisting of a leading
        BOS token followed by context_size tokens from the deque.
    """
    samples = []
    while len(dq) >= context_size:
        sample = [bos_token_id]

        # For the first (n-1) elements, pop from the left of the deque
        # and add them to the new sample; the n-th element is retained
        # in the deque for making the next sample.
        for _ in range(context_size - 1):
            sample.append(dq.popleft())
        sample.append(dq[0])

        samples.append(sample)
    return samples


def tokenize_dataset(
    text_documents: list[str],
    tokenizer: PreTrainedTokenizerBase,
    context_size: int,
    batch_size: int,
) -> list[list[int]]:
    """
    Tokenizes the input text documents using the provided tokenizer and
    generates fixed-length token sequences.

    Args:
        text_documents: List of (untokenized) text documents to be tokenized.
        tokenizer: Tokenizer to encode the text strings.
        context_size: Size of the context (input sequences).
        batch_size: Number of text documents passed to the tokenizer per call.

    Returns:
        list[list[int]]: List of fixed-length token sequences (see make_new_samples).
    """
    dq: deque[int] = deque()
    doc_idx = 0
    samples = []

    while doc_idx < len(text_documents):
        doc_idx = extend_deque(
            dq, context_size, text_documents, doc_idx, tokenizer, batch_size
        )
        samples.extend(make_new_samples(dq, context_size, tokenizer.bos_token_id))

    # We discard the last chunk, so no processing on the remainder of the deque here.
    return samples
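To make the overlap behaviour described above concrete, here is a small hand-traced sketch of make_new_samples with made-up token ids. context_size=4 and bos_token_id=0 are arbitrary illustrative values, and the import assumes this file is the delphi.dataset.tokenization module that the script above imports from.

from collections import deque

from delphi.dataset.tokenization import make_new_samples

# Made-up token ids; in real use these come from extend_deque.
dq = deque([10, 11, 12, 13, 14, 15, 16])

samples = make_new_samples(dq, context_size=4, bos_token_id=0)

# Each sample starts with the BOS id, and the token that closes one sample
# (13 below) is repeated right after the BOS of the next sample, because the
# model cannot learn a prediction from the final position of a sequence:
#   samples == [[0, 10, 11, 12, 13], [0, 13, 14, 15, 16]]
# One token (16) is left in the deque: too few to form another sample.
print(samples, list(dq))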