Merge branch 'main' into 50-evals-research-notebook-sample
menamerai authored Mar 31, 2024
2 parents 600a601 + bb5797f commit f68825b
Showing 48 changed files with 1,197 additions and 641 deletions.
14 changes: 11 additions & 3 deletions .vscode/launch.json
@@ -4,14 +4,22 @@
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [

        {
            "name": "run_training 256",
            "name": "run_training debug",
            "type": "debugpy",
            "request": "launch",
            "program": "scripts/run_training.py",
            "console": "integratedTerminal",
            "args": "--debug --train_sample_limit=256"
            //"args": "${command:pickArgs}"
            "args": "--debug --loglevel 20"
        },
        {
            "name": "run_training custom",
            "type": "debugpy",
            "request": "launch",
            "program": "scripts/run_training.py",
            "console": "integratedTerminal",
            "args": "${command:pickArgs}"
        },
        {
            "name": "run_training --help",
5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -8,4 +8,9 @@
    },
    "python.analysis.typeCheckingMode": "basic",
    "black-formatter.importStrategy": "fromEnvironment",
    "python.testing.pytestArgs": [
        "tests"
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true,
}
17 changes: 10 additions & 7 deletions notebooks/end2end_demo.ipynb
@@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -26,7 +26,7 @@
"from delphi.eval.vis_per_token_model import visualize_per_token_category\n",
"\n",
"# from delphi.eval.calc_model_group_stats import calc_model_group_stats\n",
"from delphi.eval.token_labelling import TOKEN_LABELS"
"from delphi.eval.spacy_token_labelling import TOKEN_LABELS"
]
},
{
@@ -38,12 +38,15 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# load data\n",
"tokenized_corpus_dataset = cast(Dataset, load_dataset(constants.tokenized_corpus_dataset))[\"validation\"]\n",
"tokenized_corpus_dataset = cast(Dataset, load_dataset(\n",
" constants.tokenized_corpus_dataset,\n",
" split=\"validation\"\n",
"))\n",
"\n",
"# TODO: convert to use static paths\n",
"# with open(\"../src/delphi/eval/labelled_token_ids_dict.pkl\", \"rb\") as f:\n",
@@ -66,21 +69,21 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0f8846898fbb4a1b9e872ff6511acd3d",
"model_id": "d6c18c9588f3499b94e89ccea5954780",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Dropdown(description='Token Category:', options=('Capitalized', 'Is Determiner', 'Is Interjunct…"
]
},
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
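The notebook cell above switches from indexing a DatasetDict to requesting the split directly. A minimal sketch, not part of this commit, of the two equivalent patterns with the datasets library, using the tokenized-corpus repo id from src/delphi/constants.py and assuming network access to the Hugging Face Hub:

from datasets import load_dataset

# Old pattern: load every split into a DatasetDict, then index the one you want.
dataset_dict = load_dataset("delphi-suite/v0-tinystories-v2-clean-tokenized")
validation_a = dataset_dict["validation"]

# New pattern: request the split up front and get a Dataset back directly.
validation_b = load_dataset(
    "delphi-suite/v0-tinystories-v2-clean-tokenized", split="validation"
)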
2 changes: 2 additions & 0 deletions requirements-nocuda.txt
@@ -21,6 +21,8 @@ wandb==0.16.3
spacy==3.7.2
pandas==1.3.4
dacite==1.8.1
panel==1.4.0
jupyter_bokeh==4.0.1

# temporarily installing transformers from main until 4.39.0 comes out (for mamba support)
transformers @ git+https://github.com/huggingface/transformers@main
8 changes: 5 additions & 3 deletions scripts/run_training.py
@@ -13,7 +13,7 @@

from delphi.constants import CONFIG_PRESETS_DIR
from delphi.train.config import (
    GigaConfig,
    TrainingConfig,
    build_config_from_files_and_overrides,
    get_preset_paths,
    get_user_config_path,
@@ -208,7 +208,9 @@ def setup_parser() -> (
    )
    config_arg_group = parser.add_argument_group("Config arguments")
    help_parsers = dict()
    add_dataclass_args_recursively(parser, GigaConfig, config_arg_group, help_parsers)
    add_dataclass_args_recursively(
        parser, TrainingConfig, config_arg_group, help_parsers
    )
    add_preset_args(parser)
    add_logging_args(parser)
    return parser, help_parsers
@@ -232,7 +234,7 @@ def var_args_to_dict(config_vars: dict[str, Any]) -> dict[str, Any]:

def args_to_dict(args: argparse.Namespace) -> dict[str, Any]:
    # at the toplevel, filter for args corresponding to field names in GigaConfig
    field_names = set(field.name for field in fields(GigaConfig))
    field_names = set(field.name for field in fields(TrainingConfig))
    config_vars = {
        k: v for k, v in vars(args).items() if k.split(".")[0] in field_names
    }
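The args_to_dict fragment above keeps only the CLI arguments whose top-level name matches a field of the config dataclass. A small self-contained sketch of that filtering idea; ExampleConfig is a hypothetical stand-in, not delphi's real TrainingConfig:

from dataclasses import dataclass, fields


@dataclass
class ExampleConfig:  # hypothetical stand-in for TrainingConfig
    batch_size: int = 64
    max_seq_len: int = 512


cli_vars = {"batch_size": 32, "optimizer.lr": 0.001, "loglevel": 20}
field_names = set(field.name for field in fields(ExampleConfig))
# Keep only entries whose top-level name (before any dot) is a config field.
filtered = {k: v for k, v in cli_vars.items() if k.split(".")[0] in field_names}
print(filtered)  # {'batch_size': 32}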
78 changes: 78 additions & 0 deletions scripts/tokenize_dataset.py
@@ -0,0 +1,78 @@
#!/usr/bin/env python3

import argparse

from datasets import Dataset
from transformers import AutoTokenizer

from delphi.dataset.tokenization import tokenize_dataset
from delphi.eval.utils import load_validation_dataset

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="")

    parser.add_argument(
        "--input-dataset-name",
        type=str,
        help="Text dataset from huggingface to tokenize",
    )
    parser.add_argument(
        "--output-dataset-name",
        type=str,
        help="Name of the tokenized dataset to upload to huggingface",
    )
    parser.add_argument(
        "--tokenizer-name",
        type=str,
        help="Name of the tokenizer from huggingface",
    )
    parser.add_argument(
        "--token",
        type=str,
        help="Hugging Face API token",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=512,
        help="Context size of the tokenized dataset as input of the model",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=50,
        help="Batch size of text inputs into the tokenizer",
    )
    parser.add_argument(
        "--column-name",
        type=str,
        help="Name of the column containing text documents in the input dataset",
    )
    args = parser.parse_args()

    input_dataset = load_validation_dataset(f"delphi-suite/{args.input_dataset_name}")
    tokenizer = AutoTokenizer.from_pretrained(f"delphi-suite/{args.tokenizer_name}")

    if args.column_name:
        text_docs = input_dataset[args.column_name]
    else:
        if len(input_dataset.column_names) > 1:
            raise ValueError("There is more than one column in the specified dataset")
        text_docs = input_dataset[input_dataset.column_names[0]]

    output_dataset = Dataset.from_dict(
        {
            "tokens": tokenize_dataset(
                text_docs,
                tokenizer,
                context_size=args.context_size,
                batch_size=args.batch_size,
            )
        }
    )

    output_dataset.push_to_hub(
        repo_id=f"delphi-suite/{args.output_dataset_name}",
        private=False,
        token=args.token,
    )
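For reference, a minimal sketch of driving the new tokenization helper programmatically, for example from a notebook or a test, without the CLI or the Hub upload. The tokenizer repo id below is a placeholder, not one confirmed by this commit; any tokenizer with bos and eos tokens works:

from transformers import AutoTokenizer

from delphi.dataset.tokenization import tokenize_dataset

# Placeholder tokenizer id; substitute a real tokenizer repo.
tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer")
documents = [
    "Once upon a time there was a tiny robot.",
    "The tiny robot liked to read very short stories.",
]
samples = tokenize_dataset(documents, tokenizer, context_size=16, batch_size=2)
# Number of samples and the length of each one.
print(len(samples), [len(s) for s in samples])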
9 changes: 3 additions & 6 deletions scripts/training_config_examples/sample_config.json
@@ -16,9 +16,8 @@
"batch_size": 64,
"max_seq_len": 512,
"model_config": {
"model_type": "llama2",
"mamba": null,
"llama2": {
"model_type": "LlamaForCausalLM",
"model_params": {
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": -1,
@@ -52,7 +51,5 @@
"decay_lr": true,
"warmup_iters": 1000,
"min_lr": 0.0
},
"train_sample_limit": -1,
"val_sample_limit": -1
}
}
8 changes: 3 additions & 5 deletions scripts/training_config_examples/sample_mamba.json
@@ -16,8 +16,8 @@
"batch_size": 64,
"max_seq_len": 512,
"model_config": {
"model_type": "mamba",
"mamba": {
"model_type": "MambaForCausalLM",
"model_params": {
"vocab_size": 4096,
"hidden_size": 768,
"state_size": 16,
@@ -56,7 +56,5 @@
"decay_lr": true,
"warmup_iters": 1000,
"min_lr": 0.0
},
"train_sample_limit": -1,
"val_sample_limit": -1
}
}
@@ -8,7 +8,7 @@
"batch_size": 64,
"model_config": {
"model_type": "BloomForCausalLM",
"transformers_config": {
"model_params": {
"apply_residual_connection_post_layernorm": false,
"attention_dropout": 0.0,
"bos_token_id": 1,
2 changes: 1 addition & 1 deletion src/delphi/constants.py
@@ -4,4 +4,4 @@
CONFIG_PRESETS_DIR = STATIC_ASSETS_DIR / "configs"

CORPUS_DATASET = "delphi-suite/stories"
TOKENIZED_CORPUS_DATASET = "delphi-suite/v0-tinystories-v2-clean-tokenized"
TINYSTORIES_TOKENIZED_HF_DATASET = "delphi-suite/v0-tinystories-v2-clean-tokenized"
107 changes: 107 additions & 0 deletions src/delphi/dataset/tokenization.py
@@ -0,0 +1,107 @@
from collections import deque
from typing import Optional

from transformers import PreTrainedTokenizerBase


def extend_deque(
    dq: deque[int],
    context_size: int,
    text_documents: list[str],
    doc_idx: int,
    tokenizer: PreTrainedTokenizerBase,
    batch_size: int,
) -> int:
    """
    Extends the deque with tokenized text documents until the deque grows large
    enough to reach the context size, or until all text documents are processed.

    A deque is used to save memory, as opposed to loading and tokenizing all
    documents at once.

    Args:
        dq: Deque to extend with token ids.
        context_size: Size of the context (input sequences).
        text_documents: List of (untokenized) text documents to be tokenized.
        doc_idx: Index of the current document in text_documents.
        tokenizer: Tokenizer to encode the text strings.
        batch_size: Number of documents to tokenize per tokenizer call.

    Returns:
        int: Updated index in the text documents dataset.
    """
    while len(dq) < context_size and doc_idx < len(text_documents):
        text_doc = text_documents[doc_idx : doc_idx + batch_size]
        batch_input_ids = tokenizer(
            text_doc, return_attention_mask=False, add_special_tokens=False
        )["input_ids"]
        for input_ids in batch_input_ids:
            dq.extend(input_ids + [tokenizer.eos_token_id])
        doc_idx += batch_size
    return doc_idx


def make_new_samples(
    dq: deque[int], context_size: int, bos_token_id: int
) -> list[list[int]]:
    """
    Generates new samples for training by creating sequences of tokens
    from the deque until the deque does not hold enough tokens to generate
    another sample.

    Note: the model is unable to use the last token in an input sequence,
    so we repeat this token in the next input sequence.

    Args:
        dq: Deque containing tokenized tokens.
        context_size: Size of the context (input sequences).
        bos_token_id: bos_token_id of the tokenizer used.

    Returns:
        list[list[int]]: List of token sequences, each consisting of a bos token
        followed by context_size tokens from the deque.
    """
    samples = []
    while len(dq) >= context_size:
        sample = [bos_token_id]

        # For the first (n-1) elements, pop from the left of the deque
        # and add to the new sample; the n-th element is retained
        # in the deque for making the next sample.
        for _ in range(context_size - 1):
            sample.append(dq.popleft())
        sample.append(dq[0])

        samples.append(sample)
    return samples


def tokenize_dataset(
    text_documents: list[str],
    tokenizer: PreTrainedTokenizerBase,
    context_size: int,
    batch_size: int,
) -> list[list[int]]:
    """
    Tokenizes the input text documents using the provided tokenizer and
    generates token sequences of the specified length.

    Args:
        text_documents: List of (untokenized) text documents to be tokenized.
        tokenizer: Tokenizer to encode the text strings.
        context_size: Size of the context (input sequences).
        batch_size: Number of documents to tokenize per tokenizer call.

    Returns:
        list[list[int]]: List of token sequences, each consisting of a bos token
        followed by context_size tokens.
    """
    dq = deque()
    doc_idx = 0
    samples = []

    while doc_idx < len(text_documents):
        doc_idx = extend_deque(
            dq, context_size, text_documents, doc_idx, tokenizer, batch_size
        )
        samples.extend(make_new_samples(dq, context_size, tokenizer.bos_token_id))

    # We discard the last chunk, so no processing on the remainder of the deque here
    return samples
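To make the overlap behaviour described in make_new_samples concrete, here is a small worked example, not part of the commit, with context_size=4 and bos_token_id=0. Note how token 13 closes the first sample and then reappears right after the BOS token in the second, and how the leftover token stays in the deque for the next call:

from collections import deque

from delphi.dataset.tokenization import make_new_samples

dq = deque([10, 11, 12, 13, 14, 15, 16])
samples = make_new_samples(dq, context_size=4, bos_token_id=0)
print(samples)   # [[0, 10, 11, 12, 13], [0, 13, 14, 15, 16]]
print(list(dq))  # [16] -- the remainder waits for the next extend_deque round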