Merge branch 'main' into 50-evals-research-notebook-sample
Showing 48 changed files with 1,197 additions and 641 deletions.
@@ -0,0 +1,78 @@
#!/usr/bin/env python3

import argparse

from datasets import Dataset
from transformers import AutoTokenizer

from delphi.dataset.tokenization import tokenize_dataset
from delphi.eval.utils import load_validation_dataset

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Tokenize a text dataset from Hugging Face and upload the result to the Hub."
    )

    parser.add_argument(
        "--input-dataset-name",
        type=str,
        help="Text dataset from Hugging Face to tokenize",
    )
    parser.add_argument(
        "--output-dataset-name",
        type=str,
        help="Name of the tokenized dataset to upload to Hugging Face",
    )
    parser.add_argument(
        "--tokenizer-name",
        type=str,
        help="Name of the tokenizer from Hugging Face",
    )
    parser.add_argument(
        "--token",
        type=str,
        help="Hugging Face API token",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=512,
        help="Context size of the tokenized dataset as input of the model",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=50,
        help="Batch size of text inputs into the tokenizer",
    )
    parser.add_argument(
        "--column-name",
        type=str,
        help="Name of the column containing text documents in the input dataset",
    )
    args = parser.parse_args()

    input_dataset = load_validation_dataset(f"delphi-suite/{args.input_dataset_name}")
    tokenizer = AutoTokenizer.from_pretrained(f"delphi-suite/{args.tokenizer_name}")

    if args.column_name:
        text_docs = input_dataset[args.column_name]
    else:
        if len(input_dataset.column_names) > 1:
            raise ValueError(
                "The specified dataset has more than one column; please pass --column-name"
            )
        text_docs = input_dataset[input_dataset.column_names[0]]

    output_dataset = Dataset.from_dict(
        {
            "tokens": tokenize_dataset(
                text_docs,
                tokenizer,
                context_size=args.context_size,
                batch_size=args.batch_size,
            )
        }
    )

    output_dataset.push_to_hub(
        repo_id=f"delphi-suite/{args.output_dataset_name}",
        private=False,
        token=args.token,
    )
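For a quick local look at what this script builds before it pushes anything to the Hub, here is a minimal sketch along the same lines. It assumes the delphi package from this repo is installed; the gpt2 tokenizer and the toy documents are placeholders chosen for illustration, not values used by this commit.

from datasets import Dataset
from transformers import AutoTokenizer

from delphi.dataset.tokenization import tokenize_dataset

# Toy documents stand in for the Hugging Face text dataset the script loads.
text_docs = [
    "Once upon a time there was a tiny robot.",
    "The robot liked to count pebbles by the river.",
]

# Any tokenizer that defines bos/eos token ids works; gpt2 is only a placeholder.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

tokens = tokenize_dataset(text_docs, tokenizer, context_size=8, batch_size=2)
local_dataset = Dataset.from_dict({"tokens": tokens})

print(local_dataset)               # Dataset with a single "tokens" column
print(local_dataset["tokens"][0])  # first fixed-length sample, starting with the BOS id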
@@ -0,0 +1,107 @@
from collections import deque
from typing import Optional

from transformers import PreTrainedTokenizerBase


def extend_deque(
    dq: deque[int],
    context_size: int,
    text_documents: list[str],
    doc_idx: int,
    tokenizer: PreTrainedTokenizerBase,
    batch_size: int,
) -> int:
    """
    Extends the deque with tokenized text documents until the deque grows large
    enough to reach the context size, or until all text documents are processed.

    A deque is used to save memory: documents are tokenized in small batches
    on demand instead of loading and tokenizing the whole dataset at once.

    Args:
        dq: Deque of token ids to extend.
        context_size: Size of the context (input sequences).
        text_documents: List of (untokenized) text documents to be tokenized.
        doc_idx: Index of the current text document.
        tokenizer: Tokenizer to encode the text strings.
        batch_size: Number of text documents passed to the tokenizer per call.

    Returns:
        int: Updated index in the text documents dataset.
    """
    while len(dq) < context_size and doc_idx < len(text_documents):
        text_doc = text_documents[doc_idx : doc_idx + batch_size]
        batch_input_ids = tokenizer(
            text_doc, return_attention_mask=False, add_special_tokens=False
        )["input_ids"]
        for input_ids in batch_input_ids:
            # Separate consecutive documents with the EOS token.
            dq.extend(input_ids + [tokenizer.eos_token_id])
        doc_idx += batch_size
    return doc_idx


def make_new_samples(
    dq: deque[int], context_size: int, bos_token_id: int
) -> list[list[int]]:
    """
    Generates new samples for training by creating sequences of tokens
    from the deque until the deque does not hold enough tokens to generate
    another sample.

    Note: the model is unable to use the last token in an input sequence,
    so we repeat this token at the start of the next input sequence.

    Args:
        dq: Deque containing token ids.
        context_size: Size of the context (input sequences).
        bos_token_id: bos_token_id of the tokenizer used.

    Returns:
        list[list[int]]: List of token sequences, each consisting of a leading
        BOS token followed by context_size tokens from the deque.
    """
    samples = []
    while len(dq) >= context_size:
        sample = [bos_token_id]

        # For the first (n-1) elements, pop from the left of the deque
        # and add them to the new sample; the n-th element is retained
        # in the deque for making the next sample.
        for _ in range(context_size - 1):
            sample.append(dq.popleft())
        sample.append(dq[0])

        samples.append(sample)
    return samples


def tokenize_dataset(
    text_documents: list[str],
    tokenizer: PreTrainedTokenizerBase,
    context_size: int,
    batch_size: int,
) -> list[list[int]]:
    """
    Tokenizes the input text documents using the provided tokenizer and
    generates fixed-length token sequences.

    Args:
        text_documents: List of (untokenized) text documents to be tokenized.
        tokenizer: Tokenizer to encode the text strings.
        context_size: Size of the context (input sequences).
        batch_size: Number of text documents passed to the tokenizer per call.

    Returns:
        list[list[int]]: List of fixed-length token sequences (see make_new_samples).
    """
    dq: deque[int] = deque()
    doc_idx = 0
    samples = []

    while doc_idx < len(text_documents):
        doc_idx = extend_deque(
            dq, context_size, text_documents, doc_idx, tokenizer, batch_size
        )
        samples.extend(make_new_samples(dq, context_size, tokenizer.bos_token_id))

    # We discard the last chunk, so no processing on the remainder of the deque here.
    return samples
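To make the overlap behaviour described above concrete, here is a small hand-traced sketch of make_new_samples with made-up token ids. context_size=4 and bos_token_id=0 are arbitrary illustrative values, and the import assumes this file is the delphi.dataset.tokenization module that the script above imports from.

from collections import deque

from delphi.dataset.tokenization import make_new_samples

# Made-up token ids; in real use these come from extend_deque.
dq = deque([10, 11, 12, 13, 14, 15, 16])

samples = make_new_samples(dq, context_size=4, bos_token_id=0)

# Each sample starts with the BOS id, and the token that closes one sample
# (13 below) is repeated right after the BOS of the next sample, because the
# model cannot learn a prediction from the final position of a sequence:
#   samples == [[0, 10, 11, 12, 13], [0, 13, 14, 15, 16]]
# One token (16) is left in the deque: too few to form another sample.
print(samples, list(dq))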