diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 36bb390d..4a1a0900 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -29,12 +29,11 @@ jobs: with: path: | ~/.cache/huggingface - key: ${{ runner.os }}-huggingface-cache-v1 # increment this key to invalidate the cache when new models/datasets are added + key: ${{ runner.os }}-hf-cache-v0.2 # increment this key to invalidate the cache when new models/datasets are added - name: dependencies run: | python -m pip install --upgrade pip - pip install -r requirements-nocuda.txt - pip install -e . + pip install -e .[dev,notebooks] - name: black run: black --check . - name: isort diff --git a/README.md b/README.md index f6569099..5d04ac5c 100644 --- a/README.md +++ b/README.md @@ -1,99 +1,196 @@ -# Delphi +# delphi -Interpreting Small Language Models Across Time and Scale +delphi is a set of tools for standardized and (mostly) reproducible training of small language models. You can use delphi to train a custom tokenizer, tokenize your dataset, and train your model. We build on top of HuggingFace, supporting every `CausalLM` architecture. Datasets, tokenizers and models (including checkpoints!) can be downloaded from and uploaded to HuggingFace automatically, with no need to manage local files. -# Training Models -See [`scripts/run_training.py`](scripts/run_training.py): -```bash - ./scripts/run_training.py --config_file /path/to/my/training/config.json + +# Setup + +1. Clone the repo +```shell +git clone https://github.com/delphi-suite/delphi.git +cd delphi +``` +2. Make & activate python >= 3.10 virtual env +```shell +python3.10 -m venv .venv +source .venv/bin/activate +``` +3. Install the project in editable state +`pip install -e .` +See `[project.optional-dependencies]` section in `pyproject.toml` for additional dependencies, e.g. you may want to `pip install -e ."[dev,mamba_cuda]"` +4. 
get your HuggingFace and W&B tokens and put them in the environment variables +```shell +export HF_TOKEN=... +export WANDB_API_KEY=... ``` -See [`scripts/sample_config.json`](scripts/sample_config.json) for an example of a training run json. +# Training a tokenizer -## Features -### Uploading to HuggingFace -With `huggingface.push_checkpoints_to_hub` set to `True`, the model and all associated -training run data will be uploaded to HuggingFace repo specified by `huggingface.repo_id` -every checkpoint. Every upload will be in a new folder named by the current iteration (e.g. `iter_1`). -### Resuming model training -With `init_from` set to `'resume'`, training will resume from `output_dir`. -### Deterministic, Reproducible* Training -Delphi aims to be deterministic and as reproducible as possible. However, there is one major caveat: hardware. CUDA algorithms are not always 100% isomorphic to CPU algorithms. We do record the hardware device type each training run uses, -to enable reproduction *given the same class of hardware*. -### Different Model Architectures -`model_config.model_type` can specify currently supported architectures. At time of writing, these are `'llama2'` and `'mamaba`'. Config for the selected model type should -be in `model_config.` (e.g. `model_config.llama2`) and correspond to the -arguments for that model type. See [`model_types.py`](src/delphi/train/config/models/model_types.py) -### Weights and Biases Integration +If you want to train a small and efficient model on a narrow dataset, then we recommend using a custom tokenizer with a small vocabulary. To train a reversible, GPT2-style, BPE tokenizer you can use `scripts/train_tokenizer.py`. +Script usage: -# Analyzing Models -TODO +``` +> scripts/train_tokenizer.py --help +usage: train_tokenizer.py [-h] --in-dataset IN_DATASET --feature FEATURE --split SPLIT + --vocab-size VOCAB_SIZE + [--out-dir OUT_DIR] [--out-repo OUT_REPO] + +Train a custom, reversible, BPE tokenizer (GPT2-like). 
You need to provide --out-repo or --out-dir. + +options: + -h, --help show this help message and exit + --in-dataset IN_DATASET, -i IN_DATASET + Dataset you want to train the tokenizer on. Local path or HF repo id + --feature FEATURE, -f FEATURE + Name of the feature (column) containing text documents in the input dataset + --split SPLIT, -s SPLIT + Split of the dataset to be used for tokenizer training, supports slicing like 'train[:10%]' + --vocab-size VOCAB_SIZE, -v VOCAB_SIZE + Vocabulary size of the tokenizer + --out-dir OUT_DIR Local directory to save the resulting tokenizer + --out-repo OUT_REPO HF repo id to upload the resulting tokenizer +``` -# Development +Here's how we trained the tokenizer for our `stories-*` suite of models. Please note that you can use single letter abbreviations for most arguments. + +``` +> scripts/train_tokenizer.py \ + --in-dataset delphi-suite/stories \ + --feature story \ + --split train \ + --vocab-size 4096 \ + --out-repo delphi-suite/stories-tokenizer +``` + +We use the only feature named `story` in the `train` split of [delphi-suite/stories](https://huggingface.co/datasets/delphi-suite/stories). We train a tokenizer with a vocabulary of 4096 tokens, and upload it to HF model repo [delphi-suite/stories-tokenizer](https://huggingface.co/delphi-suite/stories-tokenizer). -## Setup -1. Clone this repo and submodules: `git clone https://github.com/delphi-suite/delphi.git --recurse-submodules` -2. make python 3.10 virtual env in `.venv` -3. install dependencies `pip install -r requirements.txt` -4. install the project in editable state `pip install -e .` -5. run tests `pytest` +# Tokenizing a dataset -### Submodule Setup -If you cloned without `--recurse-submodules`, you can still install the submodules later with: -```bash -git submodule init -git submodule update +To turn a collection of text documents into sequences of tokens required for model training, you can use `scripts/tokenize_dataset.py`. 
All documents are tokenized and concatenated, with the `<eos>` token as a separator, e.g. ``` +doc1_tok1, doc1_tok2, ..., doc1_tokX, <eos>, doc2_tok1, doc2_tok2, ..., doc2_tokX, <eos>, doc3_tok1, ... +``` +Then this is divided into chunks, and the `<bos>` token is inserted at the beginning of each chunk, e.g. +``` +<bos> doc1_tok1, doc1_tok2, ..., doc1_tokX, <eos>, doc2_tok1 +<bos> doc2_tok2, ..., doc2_tok511 +<bos> doc2_tok512, doc2_tok513, ..., doc2_tokX <eos>, doc3_tok1, ... +... +``` +It will produce sequences of specified size, by discarding the last chunk if it's too short. We don't use padding. -## Formatting +Script usage: -We're using black & isort to format the code. To make sure your changes adhere to the rules: +``` +> scripts/tokenize_dataset.py --help +usage: tokenize_dataset.py [-h] --in-dataset IN_DATASET --feature FEATURE --split SPLIT + --tokenizer TOKENIZER --seq-len SEQ_LEN + [--batch-size BATCH_SIZE] [--chunk-size CHUNK_SIZE] + [--out-dir OUT_DIR] [--out-repo OUT_REPO] + +Tokenize a text dataset using a specific tokenizer + +options: + -h, --help show this help message and exit + --in-dataset IN_DATASET, -i IN_DATASET + Dataset you want to tokenize. Local path or HF repo id + --feature FEATURE, -f FEATURE + Name of the feature (column) containing text documents in the input dataset + --split SPLIT, -s SPLIT + Split of the dataset to be tokenized, supports slicing like 'train[:10%]' + --tokenizer TOKENIZER, -t TOKENIZER + HF repo id or local directory containing the tokenizer + --seq-len SEQ_LEN, -l SEQ_LEN + Length of the tokenized sequences + --batch-size BATCH_SIZE, -b BATCH_SIZE + How many text documents to tokenize at once (default: 50) + --chunk-size CHUNK_SIZE, -c CHUNK_SIZE + Maximum number of tokenized sequences in a single parquet file (default: 200_000) + --out-dir OUT_DIR Local directory to save the resulting dataset + --out-repo OUT_REPO HF repo id to upload the resulting dataset +``` -1. follow setup instructions above -2. install pre-commit `pre-commit install` -3.
install recommended vscode extensions +Here's how we tokenized the dataset for our `stories-*` suite of models. Please note that you can use single letter abbreviations for most arguments. -When you save a file vscode should automatically format it. Otherwise, pre-commit will do that, but you will need to add the changes and commit again. +For `train` split: +``` +> scripts/tokenize_dataset.py \ + --in-dataset delphi-suite/stories \ + --feature story \ + --split train \ + --tokenizer delphi-suite/stories-tokenizer \ + --seq-len 512 \ + --out-repo delphi-suite/stories-tokenized +``` +For `validation` split, repeated arguments omitted: +``` +> scripts/tokenize_dataset.py \ + ... + --split validation \ + ... +``` -## Pull Requests - -1. make a branch - - if it relates to an existing issue - - go to the issue page and click _Create a branch_ under _Development_ - - if the default name is not very long, keep it; otherwise, make it shorter, but keep the issue number in the front - - otherwise pick a short but descriptive name, a few hyphen-separated-words -2. make your changes - - include unit tests - - update README if needed - - if new huggingface datasets/models are added to testing, increment the cache number in `.github/workflows/checks.yml` -3. make a pull request - - if it isn't ready for review yet, mark it as draft - - check if CI is passing - - if the change is big, try to keep the commit history clean using interactive rebase - - don't push more often than it's needed, we're running github actions on a free tier - - if there were any changes to the main branch, rebase on top of it - - explain the change - - provide short description; focus on things that were not mentioned in the relevant issue - - comment important sections of the code in _Files changed_ tab - - when it's ready, add the relevant stakeholders as reviewers -4. 
after the comments are resolved and PR is approved, merge it using _Squash and merge_ - -## Incrementing Versions -When making a new release, increment the version in `delphi/__init__.py` +The input dataset is the same as in tokenizer training example above. We tokenize it with our custom [delphi-suite/stories-tokenizer](https://huggingface.co/delphi-suite/stories-tokenizer) into sequences of length 512. We upload it to HF dataset repo [delphi-suite/stories-tokenized](https://huggingface.co/datasets/delphi-suite/stories-tokenized). + +Please note that you can use any HuggingFace tokenizer, you don't need to train a custom one. + +# Training a model + +To train a model, you'll need to create a config file. For examples see `configs/`, and for field descriptions see `delphi/train/config/training_config.py`. The training script is located in `scripts/train_model.py`. + +Script usage: + +``` +> scripts/train_model.py --help +usage: train_model.py [-h] [--overrides [OVERRIDES ...]] [-v | -s] [config_files ...] + +Train a delphi model + +positional arguments: + config_files Path to json file(s) containing config values, e.g. 'primary_config.json secondary_config.json'. + +options: + -h, --help show this help message and exit + --overrides [OVERRIDES ...] + Override config values with space-separated declarations. e.g. `--overrides model_config.hidden_size=42 run_name=foo` + -v, --verbose Increase verbosity level, repeatable (e.g. -vvv). Mutually exclusive with --silent, --loglevel + -s, --silent Silence all logging. Mutually exclusive with --verbose, --loglevel +``` + +You can specify primary config and secondary config, which is useful if you're training a suite of models that only differ in a few parameters. Additionally, you can override specific fields using the `--overrides` flag. If you don't want to push the model and its checkpoints to HF, you need to explicitly set `out_repo=""`. If you don't want to log to W&B, you need to set `wandb=""`. 
Please note that by default we save the optimizer state (2x model size) with every checkpoint. + +Here is how we trained our `stories-mamba-100k` model +``` +> scripts/train_model.py \ + configs/stories/mamba/base.json \ + configs/stories/mamba/100k.json \ + --overrides \ + out_repo="delphi-suite/stories-mamba-100k" \ + wandb="delphi-suite/delphi" +``` + +# Development + +1. Install the `dev` and `notebooks` dependencies `pip install -e ."[dev,notebooks]"`. +2. Run the tests `pytest`. +3. Install pre-commit `pre-commit install`. +4. Install the recommended vscode extensions. + +When you save a file vscode should automatically format it. Otherwise, pre-commit will do that, but you will need to add the changes and commit again. # Citation -If you use `delphi` in your research, please cite using the following +If you use delphi in your research, please cite using the following ```bibtex @software{delphi, title = {delphi: small language models training made easy}, - author = {Jett Janiak, Jai Dhyani, Jannik Brinkmann, Gonçalo Paulo, Joshua Wendland, Víctor Abia Alonso, Siwei Li, Rai (Phan Anh Duong), Alice Rigg}, + author = {Jett Janiak, Jai Dhyani, Jannik Brinkmann, Gonçalo Paulo, Joshua Wendland, Víctor Abia Alonso, Siwei Li, Phan Anh Duong, Alice Rigg}, year = 2024, url = {https://github.com/delphi-suite/delphi}, license = {apache-2.0} } -``` +``` \ No newline at end of file diff --git a/configs/debug.json b/configs/debug.json deleted file mode 100644 index bdfd6308..00000000 --- a/configs/debug.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "max_seq_len": 512, - "max_epochs": 2, - "eval_iters": 1, - "batch_ordering_seed": 42, - "torch_seed": 1337, - "batch_size": 64, - "model_config": { - "model_class": "LlamaForCausalLM", - "hidden_size": 48, - "intermediate_size": 48, - "num_attention_heads": 2, - "num_hidden_layers": 2, - "num_key_value_heads": 2, - "vocab_size": 4096 - }, - "dataset": { - "name": "delphi-suite/v0-tinystories-v2-clean-tokenized" - } -} \ No newline at 
end of file diff --git a/configs/sample_config.json b/configs/sample_config.json deleted file mode 100644 index ac538399..00000000 --- a/configs/sample_config.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "run_name": "2024_03_15_17_28_14", - "output_dir": "/Users/jaidhyani/Library/Application Support/delphi", - "dataset": { - "name": "delphi-suite/v0-tinystories-v2-clean-tokenized" - }, - "device": "auto", - "log_interval": 1, - "eval_iters": 100, - "wandb": { - "project": "delphi", - "entity": "set_wandb.entity_to_your_wandb_username_to_make_wandb_logging_work" - }, - "batch_size": 64, - "max_seq_len": 512, - "model_config": { - "model_class": "LlamaForCausalLM", - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": -1, - "eos_token_id": -2, - "hidden_act": "silu", - "hidden_size": 288, - "initializer_range": 0.02, - "intermediate_size": 288, - "max_position_embeddings": 512, - "num_attention_heads": 6, - "num_hidden_layers": 6, - "num_key_value_heads": 6, - "pretraining_tp": 1, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000.0, - "tie_word_embeddings": false, - "use_cache": true, - "vocab_size": 4096 - }, - "max_epochs": 10, - "gradient_accumulation_steps": 1, - "grad_clip": 1.0, - "adam": { - "learning_rate": 0.0005, - "weight_decay": 0.1, - "beta1": 0.9, - "beta2": 0.95, - "decay_lr": true, - "warmup_iters": 1000, - "min_lr": 0.0 - }, - "batch_ordering_seed": 42, - "torch_seed": 1337 -} \ No newline at end of file diff --git a/configs/sample_mamba.json b/configs/sample_mamba.json deleted file mode 100644 index 7fddcb26..00000000 --- a/configs/sample_mamba.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "max_seq_len": 512, - "max_epochs": 2, - "log_interval": 1, - "eval_iters": 10, - "batch_size": 8, - "model_config": { - "model_class": "MambaForCausalLM", - "vocab_size": 4096, - "hidden_size": 48, - "state_size": 16, - "num_hidden_layers": 2, - "conv_kernel": 2, - "expand": 2, - "time_step_rank": 2 - }, - "batch_ordering_seed": 42, 
- "torch_seed": 1337, - "dataset": { - "name": "delphi-suite/v0-tinystories-v2-clean-tokenized" - } -} \ No newline at end of file diff --git a/configs/sample_transformers_bloom.json b/configs/sample_transformers_bloom.json deleted file mode 100644 index 793f6a8a..00000000 --- a/configs/sample_transformers_bloom.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "max_seq_len": 512, - "max_epochs": 2, - "eval_iters": 1, - "batch_size": 64, - "model_config": { - "model_class": "BloomForCausalLM", - "apply_residual_connection_post_layernorm": false, - "attention_dropout": 0.0, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_dropout": 0.0, - "hidden_size": 8, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "n_head": 2, - "n_layer": 2, - "pretraining_tp": 1, - "slow_but_exact": false, - "use_cache": true, - "vocab_size": 4096 - }, - "batch_ordering_seed": 42, - "torch_seed": 1337, - "dataset": { - "name": "delphi-suite/v0-tinystories-v2-clean-tokenized" - } -} \ No newline at end of file diff --git a/configs/stories/llama2/README.md b/configs/stories/llama2/README.md index be1f976e..6192ef38 100644 --- a/configs/stories/llama2/README.md +++ b/configs/stories/llama2/README.md @@ -1,7 +1,6 @@ -not using padding, so pad_token_id not set -use_cache - using default -pretraining_tp - experimental parallelization we're not using, which is the default -tie_word_embeddings - llama2 used False and this is better for interpretability, note that llama2.c is using True by default, which is probably more efficient use of parameters for very small models -rope settings are widely used defaults -attention_bias - no biases on QKV and output projection is the default and that's what we're using -attention_dropout - this is the only dropout llama2 can use, it's set to prob=0 by default and that's what we're using \ No newline at end of file +- use_cache - using default +- pretraining_tp - experimental parallelization we're not using, which is the default +- tie_word_embeddings - 
llama2 used False and this is better for interpretability, note that llama2.c is using True by default, which is probably more efficient use of parameters for very small models +- rope settings are widely used defaults +- attention_bias - no biases on QKV and output projection is the default and that's what we're using +- attention_dropout - this is the only dropout llama2 can use, it's set to prob=0 by default and that's what we're using \ No newline at end of file diff --git a/configs/stories/llama2/base.json b/configs/stories/llama2/base.json index 4a2394d5..870ddf2f 100644 --- a/configs/stories/llama2/base.json +++ b/configs/stories/llama2/base.json @@ -48,7 +48,7 @@ "batch_ordering_seed": 1337, "torch_seed": 42, "dataset": { - "name": "delphi-suite/stories-tokenized" + "path": "delphi-suite/stories-tokenized" }, "tokenizer": "delphi-suite/stories-tokenizer" } \ No newline at end of file diff --git a/configs/stories/mamba/README.md b/configs/stories/mamba/README.md index 3e83bccc..7e30ceb7 100644 --- a/configs/stories/mamba/README.md +++ b/configs/stories/mamba/README.md @@ -1,10 +1,8 @@ -pad_token_id - we're not using pad tokens, do we don't set it -layer_norm_eps - different than rms norm eps in mamba -initializer_range - different in mamba & llama -residual_in_fp32 - mamba specific parameter -time_step_* - mamba specific, sane defaults -there is no way to untie embeddings and unembeddings in mamba, they're tied by default -https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/mamba/modeling_mamba.py#L602-L610 -rescale_prenorm_residual was True in original paper, so we set it to True, despite HF default being false -using default for use_cache -state_size is default \ No newline at end of file +- layer_norm_eps - different than rms norm eps in llama +- initializer_range - different in mamba & llama +- residual_in_fp32 - mamba specific parameter +- time_step_* - mamba specific, sane defaults +- there is no way to untie embeddings and 
def single_loss_diff_to_color(loss_diff: float) -> str:
    """Map a loss difference to a CSS ``rgb(...)`` color string.

    The value is squashed through a logistic sigmoid and placed on a
    red-white-green scale: negative differences are red, zero is white and
    positive differences are green. The further from zero, the more
    saturated the color.

    Args:
        loss_diff: Loss difference to colorize. Arbitrarily large magnitudes
            (including infinities) saturate to pure red / pure green; NaN is
            rendered as white.

    Returns:
        A string of the form ``"rgb(R, G, B)"`` with integer channels 0-255.
    """
    if math.isnan(loss_diff):
        # NaN has no sign, so show it as neutral instead of crashing in int()
        return "rgb(255, 255, 255)"

    def sigmoid(x: float) -> float:
        try:
            return 1 / (1 + math.exp(-x))
        except OverflowError:
            # math.exp overflows for very negative x; the sigmoid has long
            # since saturated to 0 there
            return 0.0

    scaled_loss_diff = sigmoid(loss_diff)  # scale to 0-1

    if scaled_loss_diff < 0.5:  # red
        red_val = 255
        green_blue_val = min(int(255 * 2 * scaled_loss_diff), 255)
        return f"rgb({red_val}, {green_blue_val}, {green_blue_val})"

    # green (0.5 itself maps to white)
    green_val = 255
    red_blue_val = min(int(255 * 2 * (1 - scaled_loss_diff)), 255)
    return f"rgb({red_blue_val}, {green_val}, {red_blue_val})"
" + # this is so we can copy the prompt without "\n"s + specific_styles["user-select"] = "none" + str_token = str_token.replace("<", "<").replace(">", ">") + + style_str = data_str = "" + # converting style dict into the style attribute + if specific_styles: + inside_style_str = "; ".join(f"{k}: {v}" for k, v in specific_styles.items()) + style_str = f" style='{inside_style_str}'" + if data: + data_str = "".join( + f" data-{k}='{v.replace(' ', ' ')}'" for k, v in data.items() + ) + return f"
{str_token}
{br}" + + +_token_style = { + "border": "1px solid #888", + "display": "inline-block", + # each character of the same width, so we can easily spot a space + "font-family": "monospace", + "font-size": "14px", + "color": "black", + "background-color": "white", + "margin": "1px 0px 1px 1px", + "padding": "0px 1px 1px 1px", +} +_token_emphasized_style = { + "border": "3px solid #888", + "display": "inline-block", + "font-family": "monospace", + "font-size": "14px", + "color": "black", + "background-color": "white", + "margin": "1px 0px 1px 1px", + "padding": "0px 1px 1px 1px", +} +_token_style_str = " ".join([f"{k}: {v};" for k, v in _token_style.items()]) +_token_emphasized_style_str = " ".join( + [f"{k}: {v};" for k, v in _token_emphasized_style.items()] +) + + +def vis_pos_map( + pos_list: list[tuple[int, int]], + selected_tokens: list[int], + metrics: Float[torch.Tensor, "prompt pos"], + token_ids: Int[torch.Tensor, "prompt pos"], + tokenizer: PreTrainedTokenizerBase, +): + """ + Randomly sample from pos_map and visualize the loss diff at the corresponding position. + """ + + token_htmls = [] + unique_id = str(uuid.uuid4()) + token_class = f"pretoken_{unique_id}" + selected_token_class = f"token_{unique_id}" + hover_div_id = f"hover_info_{unique_id}" + + # choose a random keys from pos_map + key = random.choice(pos_list) + + prompt, pos = key + all_toks = token_ids[prompt][: pos + 1] + + for i in range(all_toks.shape[0]): + token_id = cast(int, all_toks[i].item()) + value = metrics[prompt][i].item() + token_htmls.append( + token_to_html( + token_id, + tokenizer, + bg_color="white" + if np.isnan(value) + else single_loss_diff_to_color(value), + data={"loss-diff": f"{value:.2f}"}, + class_name=token_class + if token_id not in selected_tokens + else selected_token_class, + ) + ) + + # add break line + token_htmls.append("

") + + html_str = f""" + + {"".join(token_htmls)}
def token_selector(
    vocab_map: dict[str, int]
) -> tuple[pn.widgets.MultiChoice, list[int]]:
    """Create a multi-choice widget over the vocabulary and a live id list.

    Args:
        vocab_map: Maps a token string to its token id. The keys become the
            options offered by the widget.

    Returns:
        A ``(widget, selected_ids)`` pair. ``selected_ids`` is updated in
        place whenever the widget's ``value`` changes, so callers can keep a
        reference to it.
    """
    widget = pn.widgets.MultiChoice(name="Tokens", options=list(vocab_map))
    initial_choice = cast(list[str], widget.value)
    selected_ids = [vocab_map[name] for name in initial_choice]

    def _on_value_change(event):
        # mutate the very list handed back to the caller, never rebind it
        selected_ids.clear()
        selected_ids.extend([vocab_map[name] for name in event.new])

    widget.param.watch(_on_value_change, "value")
    return widget, selected_ids
def dict_filter_quantile(
    d: dict[Any, float], q_start: float, q_end: float
) -> dict[Any, float]:
    """Keep the entries of ``d`` whose value lies inside a quantile band.

    The band limits are the ``q_start`` and ``q_end`` quantiles of the
    non-NaN values of ``d``; both limits are inclusive.

    Args:
        d: Mapping from arbitrary keys to numeric values (NaN allowed).
        q_start: Lower quantile, ``0 <= q_start < q_end``.
        q_end: Upper quantile, ``q_start < q_end <= 1``.

    Returns:
        A new dict with the entries whose value is within the band.
        Entries whose value is NaN are always dropped.

    Raises:
        ValueError: If the quantile range is not ``0 <= q_start < q_end <= 1``.
    """
    if not (0 <= q_start < q_end <= 1):
        raise ValueError("Invalid quantile range")
    if not d:
        # nothing to rank, so skip the quantile computation entirely
        return {}

    # one pass over the values for both band limits
    q_start_val, q_end_val = np.nanquantile(list(d.values()), [q_start, q_end])

    # comparisons involving NaN are always False, which filters NaN values out
    return {k: v for k, v in d.items() if q_start_val <= v <= q_end_val}
def visualize_selected_tokens(
    input: dict[str | int, tuple[float, float, float]],
    log_scale=False,
    line_metric="Means",
    checkpoint_mode=True,
    shade_color="rgba(68, 68, 68, 0.3)",
    line_color="rgb(31, 119, 180)",
    bar_color="purple",
    marker_color="SkyBlue",
    background_color="AliceBlue",
) -> go.FigureWidget:
    """Plot a loss summary with lower/upper error bounds per x-axis entry.

    Args:
        input: Maps an x-axis label to a ``(middle, lower_error, upper_error)``
            triple. The errors are offsets: the plotted bounds are
            ``middle - lower_error`` and ``middle + upper_error``.
        log_scale: Use a logarithmic y axis instead of a linear one.
        line_metric: Legend name of the central line (checkpoint mode only).
        checkpoint_mode: If true, draw a line with a shaded band between the
            bounds; otherwise draw markers with asymmetric error bars and
            hover text.
        shade_color: Fill color of the band (checkpoint mode).
        line_color: Color of the central line / marker outline.
        bar_color: Color of the error bars (non-checkpoint mode).
        marker_color: Marker fill color (non-checkpoint mode).
        background_color: Plot background color.

    Returns:
        A ``go.FigureWidget`` with a "Loss" y axis.
    """
    input_x = list(input)

    # rows: middle, lower error, upper error; the reshape keeps the
    # three-row layout even when `input` is empty
    values = np.array([input[key] for key in input_x]).reshape(-1, 3).T
    means, err_lo, err_hi = values[0], values[1], values[2]

    if checkpoint_mode:
        # Keep this a plain list of traces: when a go.Figure is passed as
        # `data`, plotly takes the layout from that figure and the layout
        # built below (log axis, title, background) would be discarded.
        traces = [
            go.Scatter(
                name="Upper Bound",
                x=input_x,
                y=means + err_hi,
                mode="lines",
                marker=dict(color=shade_color),
                line=dict(width=0),
                showlegend=False,
            ),
            go.Scatter(
                name="Lower Bound",
                x=input_x,
                y=means - err_lo,
                marker=dict(color=shade_color),
                line=dict(width=0),
                mode="lines",
                fillcolor=shade_color,
                # fills up to the previous trace, i.e. the upper bound
                fill="tonexty",
                showlegend=False,
            ),
            go.Scatter(
                name=line_metric,
                x=input_x,
                y=means,
                mode="lines",
                marker=dict(
                    color=line_color,
                    size=0,
                    line=dict(color=line_color, width=1),
                ),
            ),
        ]
    else:
        hovertexts = [
            f"Loss: {mid:.3f} ({lo:.3f}, {hi:.3f})"
            for mid, lo, hi in zip(means, err_lo, err_hi)
        ]
        traces = [
            go.Scatter(
                x=input_x,
                y=means,
                error_y=dict(
                    type="data",
                    symmetric=False,
                    array=err_hi,
                    arrayminus=err_lo,
                    color=bar_color,
                ),
                marker=dict(
                    color=marker_color,
                    size=15,
                    line=dict(color=line_color, width=2),
                ),
                hovertext=hovertexts,
                hoverinfo="text+x",
            )
        ]

    return go.FigureWidget(
        data=traces,
        layout=go.Layout(
            yaxis=dict(
                title="Loss",
                type="log" if log_scale else "linear",
            ),
            plot_bgcolor=background_color,
        ),
    )
--git a/src/delphi/eval/__init__.py b/delphi/train/__init__.py similarity index 100% rename from src/delphi/eval/__init__.py rename to delphi/train/__init__.py diff --git a/src/delphi/train/checkpoint_step.py b/delphi/train/checkpoint_step.py similarity index 100% rename from src/delphi/train/checkpoint_step.py rename to delphi/train/checkpoint_step.py diff --git a/src/delphi/train/config/__init__.py b/delphi/train/config/__init__.py similarity index 85% rename from src/delphi/train/config/__init__.py rename to delphi/train/config/__init__.py index fe0e825b..dc5a504d 100644 --- a/src/delphi/train/config/__init__.py +++ b/delphi/train/config/__init__.py @@ -6,4 +6,3 @@ dot_notation_to_dict, get_user_config_path, ) -from .wandb_config import WandbConfig diff --git a/src/delphi/train/config/adam_config.py b/delphi/train/config/adam_config.py similarity index 100% rename from src/delphi/train/config/adam_config.py rename to delphi/train/config/adam_config.py diff --git a/delphi/train/config/dataset_config.py b/delphi/train/config/dataset_config.py new file mode 100644 index 00000000..8f3fbe47 --- /dev/null +++ b/delphi/train/config/dataset_config.py @@ -0,0 +1,35 @@ +from dataclasses import dataclass, field + +from beartype import beartype +from datasets import Dataset + +from delphi import utils + + +@beartype +@dataclass(frozen=True) +class DatasetConfig: + # tokenized dataset; HF repo id or local directory + path: str + + # feature in the dataset; should be a list of <= max_seq_len token ints + feature: str = "tokens" + + # split of the dataset to use for training + train_split: str = "train" + + # split of the dataset to use for validation + validation_split: str = "validation" + + def _load(self, split) -> Dataset: + ds = utils.load_dataset_split_sequence_int32_feature( + self.path, split, self.feature + ) + ds.set_format("torch") + return ds + + def load_train(self) -> Dataset: + return self._load(self.train_split) + + def load_validation(self) -> Dataset: + 
return self._load(self.validation_split) diff --git a/src/delphi/train/config/debug_config.py b/delphi/train/config/debug_config.py similarity index 100% rename from src/delphi/train/config/debug_config.py rename to delphi/train/config/debug_config.py diff --git a/delphi/train/config/training_config.py b/delphi/train/config/training_config.py new file mode 100644 index 00000000..1e4d3730 --- /dev/null +++ b/delphi/train/config/training_config.py @@ -0,0 +1,82 @@ +import os +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any, Optional + +import platformdirs +from beartype import beartype + +from .adam_config import AdamConfig +from .dataset_config import DatasetConfig +from .debug_config import DebugConfig + + +@beartype +@dataclass(frozen=True, kw_only=True) +class TrainingConfig: + # model config; class_name=name of model class in transformers, everything else is kwargs for the corresponding model config + model_config: dict[str, Any] + + max_seq_len: int + run_name: str = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + out_dir: str = os.path.join(platformdirs.user_data_dir(appname="delphi"), run_name) + + # device to use (cuda, mps, cpu) + device: str = "auto" + + # checkpoint every N iters + checkpoint_interval: int = 2000 + + # manually list iterations to save checkpoints on + extra_checkpoint_iters: list[int] = field(default_factory=list) + + # log to the console every N iters; this doesn't control wandb logging which is done only on checkpoints + log_interval: int = 1 + + # FIXME: there is a bug in the current implementation, and eval loss is computed on the + # entire dataset. In this implementation, eval_iters controls the number of minibatches + # the dataset is split into for evaluation. 
+ eval_iters: int = 100 + + # path to a checkpoint to resume from + resume_from_path: Optional[str] = None + + # number of samples used to compute the gradient for a single optimizer step + batch_size: int = 64 + + # total number of training epochs + max_epochs: int = 10 + + # clip gradients at this value, or disable if == 0.0 + grad_clip: float = 1.0 + + # if > 1 reduces memory usage by computing gradient in microbatches + gradient_accumulation_steps: int = 1 + + # AdamW optimizer + adam: AdamConfig = field(default_factory=AdamConfig) + + # seed used for pseudorandomly sampling data during training + batch_ordering_seed: int + + # seed used for torch + torch_seed: int + + # whether to save the optimizer state with each checkpoint + # this is twice as large as the model, but allows to resume training in a reproducible way + save_optimizer: bool = True + + # specify training and validation data + dataset: DatasetConfig + + # HF repo id or local directory containing the tokenizer. Used only to upload it to HF with the model, not for training + tokenizer: str = "" + + # wandb config in 'entity/project' form. Set to empty string to not use wandb. + wandb: str + + # HF repo id. Set to empty string to not push to repo. 
+ out_repo: str + + # debug config + debug_config: DebugConfig = field(default_factory=DebugConfig) diff --git a/src/delphi/train/config/utils.py b/delphi/train/config/utils.py similarity index 81% rename from src/delphi/train/config/utils.py rename to delphi/train/config/utils.py index b645cd5e..7fda9cf1 100644 --- a/src/delphi/train/config/utils.py +++ b/delphi/train/config/utils.py @@ -65,28 +65,6 @@ def build_config_dict_from_files(config_files: list[Path]) -> dict[str, Any]: return combined_config -def set_backup_vals(config: dict[str, Any], config_files: list[Path]): - """ - Convenience default values for run_name and output_dir based on config file (if exactly one passed) - - If the user is using 1 config file and has not set a run_name, we set it to the filename. - Likewise for output_dir, we set it to a user-specific directory based on the run_name. - """ - if len(config_files) == 1: - prefix = f"{config_files[0].stem}__" - else: - prefix = "" - if "run_name" not in config: - run_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") - config["run_name"] = f"{prefix}{run_time}" - logging.info(f"Setting run_name to {config['run_name']}") - if "output_dir" not in config: - config["output_dir"] = os.path.join( - platformdirs.user_data_dir(appname="delphi"), config["run_name"] - ) - logging.info(f"Setting output_dir to {config['output_dir']}") - - def cast_types(config: dict[str, Any], target_dataclass: Type): """ user overrides are passed in as strings, so we need to cast them to the correct type @@ -118,13 +96,11 @@ def build_config_from_files_and_overrides( (we expect this to be passed as strings w/o type hints from a script argument: e.g. `--overrides model_config.hidden_size=42 run_name=foo`) 3. Merge in overrides to config_dict, taking precedence over all config_files values. - 4. Set backup values (for run_name and output_dir) if they are not already set. - 5. Build the TrainingConfig object from the final config dict and return it. + 4. 
Build the TrainingConfig object from the final config dict and return it. """ combined_config = build_config_dict_from_files(config_files) cast_types(overrides, TrainingConfig) merge_two_dicts(merge_into=combined_config, merge_from=overrides) - set_backup_vals(combined_config, config_files) return from_dict(TrainingConfig, combined_config, config=dacite_config(strict=True)) diff --git a/src/delphi/train/run_context.py b/delphi/train/run_context.py similarity index 100% rename from src/delphi/train/run_context.py rename to delphi/train/run_context.py diff --git a/src/delphi/train/shuffle.py b/delphi/train/shuffle.py similarity index 100% rename from src/delphi/train/shuffle.py rename to delphi/train/shuffle.py diff --git a/src/delphi/train/train_step.py b/delphi/train/train_step.py similarity index 100% rename from src/delphi/train/train_step.py rename to delphi/train/train_step.py diff --git a/src/delphi/train/training.py b/delphi/train/training.py similarity index 93% rename from src/delphi/train/training.py rename to delphi/train/training.py index e1d65adc..575f9844 100644 --- a/src/delphi/train/training.py +++ b/delphi/train/training.py @@ -5,6 +5,7 @@ from pathlib import Path import torch +from huggingface_hub import HfApi from tqdm import tqdm from transformers import AutoTokenizer @@ -25,22 +26,23 @@ def setup_training(config: TrainingConfig): logging.info("Setting up training...") - os.makedirs(config.output_dir, exist_ok=True) + os.makedirs(config.out_dir, exist_ok=True) - # torch misc - TODO: check if this is actually needed torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn - # determinism setup_determinism(config.torch_seed) - # wandb setup + if config.out_repo: + api = HfApi() + api.create_repo(config.out_repo, exist_ok=True) + if config.wandb: - init_wandb(config=config) + init_wandb(config) if config.tokenizer: tokenizer = AutoTokenizer.from_pretrained(config.tokenizer) - 
tokenizer.save_pretrained(Path(config.output_dir) / "tokenizer") + tokenizer.save_pretrained(Path(config.out_dir) / "tokenizer") def run_training(config: TrainingConfig) -> tuple[ModelTrainingState, RunContext]: diff --git a/src/delphi/train/utils.py b/delphi/train/utils.py similarity index 93% rename from src/delphi/train/utils.py rename to delphi/train/utils.py index 4eadb2c6..af55c0c7 100644 --- a/src/delphi/train/utils.py +++ b/delphi/train/utils.py @@ -17,9 +17,10 @@ from torch.optim import AdamW from transformers import PreTrainedModel +from delphi.train.config import dot_notation_to_dict + from .config import TrainingConfig from .run_context import RunContext -from .shuffle import shuffle_list @dataclass @@ -198,8 +199,8 @@ def save_results( config, context (e.g. hardware), training step, etc """ iter_name = "main" if final else f"iter{train_results.iter_num}" - output_dir = Path(config.output_dir) - results_path = output_dir / iter_name + out_dir = Path(config.out_dir) + results_path = out_dir / iter_name logging.info(f"saving checkpoint to {results_path}") results_path.mkdir(parents=True, exist_ok=True) with open(results_path / "training_config.json", "w") as file: @@ -220,19 +221,18 @@ def save_results( json.dump(training_state_dict, file, indent=2) with open(results_path / "run_context.json", "w") as file: json.dump(run_context.asdict(), file, indent=2) - if (tokenizer_dir := output_dir / "tokenizer").exists(): + if (tokenizer_dir := out_dir / "tokenizer").exists(): for src_file in tokenizer_dir.iterdir(): if src_file.is_file(): dest_file = results_path / src_file.name shutil.copy2(src_file, dest_file) - if config.out_repo_id: + if config.out_repo: try: api = HfApi() - api.create_repo(config.out_repo_id, exist_ok=True) - api.create_branch(config.out_repo_id, branch=iter_name, exist_ok=True) + api.create_branch(config.out_repo, branch=iter_name, exist_ok=True) api.upload_folder( folder_path=results_path, - repo_id=config.out_repo_id, + 
repo_id=config.out_repo, revision=iter_name, ) except Exception as e: @@ -255,3 +255,9 @@ def init_model(model_config_dict: dict[str, Any], seed: int) -> PreTrainedModel: model_params_dict = model_config_dict.copy() model_params_dict.pop("model_class") return model_class(config_class(**(model_params_dict))) + + +def overrides_to_dict(overrides: list[str]) -> dict[str, Any]: + # ["a.b.c=4", "foo=false"] to {"a": {"b": {"c": 4}}, "foo": False} + config_vars = {k: v for k, v in [x.split("=") for x in overrides if "=" in x]} + return dot_notation_to_dict(config_vars) diff --git a/src/delphi/train/wandb_utils.py b/delphi/train/wandb_utils.py similarity index 68% rename from src/delphi/train/wandb_utils.py rename to delphi/train/wandb_utils.py index 83c53912..f420b5da 100644 --- a/src/delphi/train/wandb_utils.py +++ b/delphi/train/wandb_utils.py @@ -1,5 +1,4 @@ import logging -import os from dataclasses import asdict import wandb @@ -8,19 +7,12 @@ from .utils import ModelTrainingState -def silence_wandb(): - logging.info("silencing wandb output") - os.environ["WANDB_SILENT"] = "true" - - def init_wandb(config: TrainingConfig): - # if log level < debug, silence wandb - assert config.wandb is not None - if logging.getLogger().level > logging.INFO or config.wandb.silence: - silence_wandb() + assert "/" in config.wandb, "wandb should be in the 'entity/project' form" + wandb_entity, wandb_project = config.wandb.split("/") wandb.init( - entity=config.wandb.entity, - project=config.wandb.project, + entity=wandb_entity, + project=wandb_project, name=config.run_name, config=asdict(config), ) diff --git a/src/delphi/utils.py b/delphi/utils.py similarity index 50% rename from src/delphi/utils.py rename to delphi/utils.py index 0ceb059a..15f79545 100644 --- a/src/delphi/utils.py +++ b/delphi/utils.py @@ -1,20 +1,22 @@ +from collections.abc import Callable from typing import cast +import torch from datasets import Dataset, Features, Sequence, Value, load_dataset +from jaxtyping 
import Float, Int def hf_split_to_split_name(split: str) -> str: return split.split("[")[0] -# TODO: test load_dataset functions def load_dataset_split_features( - repo_id: str, + path: str, split: str, features: Features, ) -> Dataset: dataset = load_dataset( - repo_id, + path, split=split, features=features, ) @@ -23,28 +25,28 @@ def load_dataset_split_features( def load_dataset_split_string_feature( - repo_id: str, + path: str, split: str, feature_name: str, ) -> Dataset: print("Loading string dataset") - print(f"{repo_id=}, {split=}, {feature_name=}") + print(f"{path=}, {split=}, {feature_name=}") return load_dataset_split_features( - repo_id, + path, split, Features({feature_name: Value("string")}), ) def load_dataset_split_sequence_int32_feature( - repo_id: str, + path: str, split: str, feature_name: str, ) -> Dataset: print("Loading sequence int32 dataset") - print(f"{repo_id=}, {split=}, {feature_name=}") + print(f"{path=}, {split=}, {feature_name=}") return load_dataset_split_features( - repo_id, + path, split, Features({feature_name: Sequence(Value("int32"))}), ) @@ -56,3 +58,30 @@ def get_all_hf_branch_names(repo_id: str) -> list[str]: api = HfApi() refs = api.list_repo_refs(repo_id) return [branch.name for branch in refs.branches] + + +def gather_logprobs( + logprobs: Float[torch.Tensor, "batch seq vocab"], + tokens: Int[torch.Tensor, "batch seq"], +) -> Float[torch.Tensor, "batch seq"]: + return torch.gather(logprobs, -1, tokens.unsqueeze(-1)).squeeze(-1) + + +def get_all_logprobs( + model: Callable, input_ids: Int[torch.Tensor, "batch seq"] +) -> Float[torch.Tensor, "batch seq vocab"]: + # batch, seq, vocab + logits = model(input_ids).logits + return torch.log_softmax(logits, dim=-1) + + +def get_all_and_next_logprobs( + model: Callable, + input_ids: Int[torch.Tensor, "batch seq"], +) -> tuple[ + Float[torch.Tensor, "batch shorter_seq vocab"], + Float[torch.Tensor, "batch shorter_seq"], +]: + logprobs = get_all_logprobs(model, input_ids[:, :-1]) + 
next_tokens = input_ids[:, 1:] + return logprobs, gather_logprobs(logprobs, next_tokens) diff --git a/notebooks/end2end_demo.ipynb b/notebooks/end2end_demo.ipynb deleted file mode 100644 index 3f8e938d..00000000 --- a/notebooks/end2end_demo.ipynb +++ /dev/null @@ -1,136 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from typing import cast\n", - "import pickle\n", - "from collections import defaultdict\n", - "\n", - "from datasets import load_dataset, Dataset\n", - "\n", - "from delphi.constants import STATIC_ASSETS_DIR\n", - "from delphi.eval import utils\n", - "from delphi.eval import constants\n", - "from delphi.eval.vis_per_token_model import visualize_per_token_category\n", - "\n", - "# from delphi.eval.calc_model_group_stats import calc_model_group_stats\n", - "from delphi.eval.spacy_token_labelling import TOKEN_LABELS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# load data\n", - "tokenized_corpus_dataset = cast(Dataset, load_dataset(\n", - " constants.tokenized_corpus_dataset,\n", - " split=\"validation\"\n", - "))\n", - "\n", - "# TODO: convert to use static paths\n", - "# with open(\"../src/delphi/eval/labelled_token_ids_dict.pkl\", \"rb\") as f:\n", - "# token_groups = pickle.load(f)\n", - "# model_group_stats = calc_model_group_stats(\n", - "# tokenized_corpus_dataset, logprob_datasets, token_groups, token_groups[0].keys()\n", - "# )\n", - "with open(f\"{STATIC_ASSETS_DIR}/model_group_stats.pkl\", \"rb\") as f:\n", - " model_group_stats = pickle.load(f)\n", - "\n", - "logprob_datasets = utils.load_logprob_datasets(\"validation\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Visualization" - 
] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d6c18c9588f3499b94e89ccea5954780", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Dropdown(description='Token Category:', options=('Capitalized', 'Is Determiner', 'Is Interjunct…" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "performance_data = defaultdict(dict)\n", - "for model in constants.LLAMA2_MODELS:\n", - " for token_group_desc in TOKEN_LABELS:\n", - " if (model, token_group_desc) not in model_group_stats:\n", - " continue\n", - " stats = model_group_stats[(model, token_group_desc)]\n", - " performance_data[model][token_group_desc] = (\n", - " -stats[\"median\"],\n", - " -stats[\"75th\"],\n", - " -stats[\"25th\"],\n", - " )\n", - "\n", - "visualize_per_token_category(\n", - " performance_data,\n", - " log_scale=True,\n", - " bg_color=\"LightGrey\",\n", - " line_color=\"Red\",\n", - " marker_color=\"Orange\",\n", - " bar_color=\"Green\",\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "tinyevals", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/eval_notebook.ipynb b/notebooks/eval_notebook.ipynb new file mode 100644 index 00000000..daed1f52 --- /dev/null +++ b/notebooks/eval_notebook.ipynb @@ -0,0 +1,480 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# colab cells (only run if on colab)\n", + "# TODO: experiment on colab to see how to set up the environment" + ] + }, + { 
+ "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Important\n", + "\n", + "Run this cell by cell. The token selecter cell needs to be ran first so the later cells work." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": "(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n var py_version = '3.4.1'.replace('rc', '-rc.').replace('.dev', '-dev.');\n var reloading = false;\n var Bokeh = root.Bokeh;\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks;\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, js_modules, js_exports, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n if (js_modules == null) js_modules = [];\n if (js_exports == null) js_exports = {};\n\n root._bokeh_onload_callbacks.push(callback);\n\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls.length === 0 && js_modules.length === 0 && Object.keys(js_exports).length === 0) {\n run_callbacks();\n return null;\n }\n if (!reloading) {\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n }\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n window._bokeh_on_load = on_load\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n var skip = [];\n if (window.requirejs) {\n window.requirejs.config({'packages': 
{}, 'paths': {}, 'shim': {}});\n root._bokeh_is_loading = css_urls.length + 0;\n } else {\n root._bokeh_is_loading = css_urls.length + js_urls.length + js_modules.length + Object.keys(js_exports).length;\n }\n\n var existing_stylesheets = []\n var links = document.getElementsByTagName('link')\n for (var i = 0; i < links.length; i++) {\n var link = links[i]\n if (link.href != null) {\n\texisting_stylesheets.push(link.href)\n }\n }\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n if (existing_stylesheets.indexOf(url) !== -1) {\n\ton_load()\n\tcontinue;\n }\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n } var existing_scripts = []\n var scripts = document.getElementsByTagName('script')\n for (var i = 0; i < scripts.length; i++) {\n var script = scripts[i]\n if (script.src != null) {\n\texisting_scripts.push(script.src)\n }\n }\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n if (skip.indexOf(url) !== -1 || existing_scripts.indexOf(url) !== -1) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n for (var i = 0; i < js_modules.length; i++) {\n var url = js_modules[i];\n if (skip.indexOf(url) !== -1 || existing_scripts.indexOf(url) !== -1) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n element.type = 
\"module\";\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n for (const name in js_exports) {\n var url = js_exports[name];\n if (skip.indexOf(url) >= 0 || root[name] != null) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onerror = on_error;\n element.async = false;\n element.type = \"module\";\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n element.textContent = `\n import ${name} from \"${url}\"\n window.${name} = ${name}\n window._bokeh_on_load()\n `\n document.head.appendChild(element);\n }\n if (!js_urls.length && !js_modules.length) {\n on_load()\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.1.min.js\", \"https://cdn.holoviz.org/panel/1.4.0/dist/panel.min.js\"];\n var js_modules = [];\n var js_exports = {};\n var css_urls = [\"https://cdn.holoviz.org/panel/1.4.0/dist/bundled/font-awesome/css/all.min.css\"];\n var inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {} // ensure no trailing comma for IE\n ];\n\n function run_inline_js() {\n if ((root.Bokeh !== undefined) || (force === true)) {\n for (var i = 0; i < inline_js.length; i++) {\n\ttry {\n inline_js[i].call(root, root.Bokeh);\n\t} catch(e) {\n\t if (!reloading) {\n\t throw e;\n\t }\n\t}\n }\n // Cache old bokeh versions\n if (Bokeh != undefined && !reloading) {\n\tvar NewBokeh = root.Bokeh;\n\tif (Bokeh.versions === undefined) {\n\t Bokeh.versions = new Map();\n\t}\n\tif 
(NewBokeh.version !== Bokeh.version) {\n\t Bokeh.versions.set(NewBokeh.version, NewBokeh)\n\t}\n\troot.Bokeh = Bokeh;\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n }\n root._bokeh_is_initializing = false\n }\n\n function load_or_wait() {\n // Implement a backoff loop that tries to ensure we do not load multiple\n // versions of Bokeh and its dependencies at the same time.\n // In recent versions we use the root._bokeh_is_initializing flag\n // to determine whether there is an ongoing attempt to initialize\n // bokeh, however for backward compatibility we also try to ensure\n // that we do not start loading a newer (Panel>=1.0 and Bokeh>3) version\n // before older versions are fully initialized.\n if (root._bokeh_is_initializing && Date.now() > root._bokeh_timeout) {\n root._bokeh_is_initializing = false;\n root._bokeh_onload_callbacks = undefined;\n console.log(\"Bokeh: BokehJS was loaded multiple times but one version failed to initialize.\");\n load_or_wait();\n } else if (root._bokeh_is_initializing || (typeof root._bokeh_is_initializing === \"undefined\" && root._bokeh_onload_callbacks !== undefined)) {\n setTimeout(load_or_wait, 100);\n } else {\n root._bokeh_is_initializing = true\n root._bokeh_onload_callbacks = []\n var bokeh_loaded = Bokeh != null && (Bokeh.version === py_version || (Bokeh.versions !== undefined && Bokeh.versions.has(py_version)));\n if (!reloading && !bokeh_loaded) {\n\troot.Bokeh = undefined;\n }\n load_libs(css_urls, js_urls, js_modules, js_exports, function() {\n\tconsole.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n\trun_inline_js();\n });\n }\n }\n // Give older versions of the autoload script a head-start to ensure\n // they initialize before we start loading newer version.\n setTimeout(load_or_wait, 100)\n}(window));", + 
"application/vnd.holoviews_load.v0+json": "" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "\nif ((window.PyViz === undefined) || (window.PyViz instanceof HTMLElement)) {\n window.PyViz = {comms: {}, comm_status:{}, kernels:{}, receivers: {}, plot_index: []}\n}\n\n\n function JupyterCommManager() {\n }\n\n JupyterCommManager.prototype.register_target = function(plot_id, comm_id, msg_handler) {\n if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n comm_manager.register_target(comm_id, function(comm) {\n comm.on_msg(msg_handler);\n });\n } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n window.PyViz.kernels[plot_id].registerCommTarget(comm_id, function(comm) {\n comm.onMsg = msg_handler;\n });\n } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n google.colab.kernel.comms.registerTarget(comm_id, (comm) => {\n var messages = comm.messages[Symbol.asyncIterator]();\n function processIteratorResult(result) {\n var message = result.value;\n console.log(message)\n var content = {data: message.data, comm_id};\n var buffers = []\n for (var buffer of message.buffers || []) {\n buffers.push(new DataView(buffer))\n }\n var metadata = message.metadata || {};\n var msg = {content, buffers, metadata}\n msg_handler(msg);\n return messages.next().then(processIteratorResult);\n }\n return messages.next().then(processIteratorResult);\n })\n }\n }\n\n JupyterCommManager.prototype.get_client_comm = function(plot_id, comm_id, msg_handler) {\n if (comm_id in window.PyViz.comms) {\n return window.PyViz.comms[comm_id];\n } else if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n var comm = 
comm_manager.new_comm(comm_id, {}, {}, {}, comm_id);\n if (msg_handler) {\n comm.on_msg(msg_handler);\n }\n } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n var comm = window.PyViz.kernels[plot_id].connectToComm(comm_id);\n comm.open();\n if (msg_handler) {\n comm.onMsg = msg_handler;\n }\n } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n var comm_promise = google.colab.kernel.comms.open(comm_id)\n comm_promise.then((comm) => {\n window.PyViz.comms[comm_id] = comm;\n if (msg_handler) {\n var messages = comm.messages[Symbol.asyncIterator]();\n function processIteratorResult(result) {\n var message = result.value;\n var content = {data: message.data};\n var metadata = message.metadata || {comm_id};\n var msg = {content, metadata}\n msg_handler(msg);\n return messages.next().then(processIteratorResult);\n }\n return messages.next().then(processIteratorResult);\n }\n }) \n var sendClosure = (data, metadata, buffers, disposeOnDone) => {\n return comm_promise.then((comm) => {\n comm.send(data, metadata, buffers, disposeOnDone);\n });\n };\n var comm = {\n send: sendClosure\n };\n }\n window.PyViz.comms[comm_id] = comm;\n return comm;\n }\n window.PyViz.comm_manager = new JupyterCommManager();\n \n\n\nvar JS_MIME_TYPE = 'application/javascript';\nvar HTML_MIME_TYPE = 'text/html';\nvar EXEC_MIME_TYPE = 'application/vnd.holoviews_exec.v0+json';\nvar CLASS_NAME = 'output';\n\n/**\n * Render data to the DOM node\n */\nfunction render(props, node) {\n var div = document.createElement(\"div\");\n var script = document.createElement(\"script\");\n node.appendChild(div);\n node.appendChild(script);\n}\n\n/**\n * Handle when a new output is added\n */\nfunction handle_add_output(event, handle) {\n var output_area = handle.output_area;\n var output = handle.output;\n if ((output.data == undefined) || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n return\n }\n var id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n var 
toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n if (id !== undefined) {\n var nchildren = toinsert.length;\n var html_node = toinsert[nchildren-1].children[0];\n html_node.innerHTML = output.data[HTML_MIME_TYPE];\n var scripts = [];\n var nodelist = html_node.querySelectorAll(\"script\");\n for (var i in nodelist) {\n if (nodelist.hasOwnProperty(i)) {\n scripts.push(nodelist[i])\n }\n }\n\n scripts.forEach( function (oldScript) {\n var newScript = document.createElement(\"script\");\n var attrs = [];\n var nodemap = oldScript.attributes;\n for (var j in nodemap) {\n if (nodemap.hasOwnProperty(j)) {\n attrs.push(nodemap[j])\n }\n }\n attrs.forEach(function(attr) { newScript.setAttribute(attr.name, attr.value) });\n newScript.appendChild(document.createTextNode(oldScript.innerHTML));\n oldScript.parentNode.replaceChild(newScript, oldScript);\n });\n if (JS_MIME_TYPE in output.data) {\n toinsert[nchildren-1].children[1].textContent = output.data[JS_MIME_TYPE];\n }\n output_area._hv_plot_id = id;\n if ((window.Bokeh !== undefined) && (id in Bokeh.index)) {\n window.PyViz.plot_index[id] = Bokeh.index[id];\n } else {\n window.PyViz.plot_index[id] = null;\n }\n } else if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n var bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n var script_attrs = bk_div.children[0].attributes;\n for (var i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].childNodes[1].setAttribute(script_attrs[i].name, script_attrs[i].value);\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n}\n\n/**\n * Handle when an output is cleared or removed\n */\nfunction handle_clear_output(event, handle) {\n var id = handle.cell.output_area._hv_plot_id;\n var server_id = handle.cell.output_area._bokeh_server_id;\n if (((id === undefined) || !(id in PyViz.plot_index)) && 
(server_id !== undefined)) { return; }\n var comm = window.PyViz.comm_manager.get_client_comm(\"hv-extension-comm\", \"hv-extension-comm\", function () {});\n if (server_id !== null) {\n comm.send({event_type: 'server_delete', 'id': server_id});\n return;\n } else if (comm !== null) {\n comm.send({event_type: 'delete', 'id': id});\n }\n delete PyViz.plot_index[id];\n if ((window.Bokeh !== undefined) & (id in window.Bokeh.index)) {\n var doc = window.Bokeh.index[id].model.document\n doc.clear();\n const i = window.Bokeh.documents.indexOf(doc);\n if (i > -1) {\n window.Bokeh.documents.splice(i, 1);\n }\n }\n}\n\n/**\n * Handle kernel restart event\n */\nfunction handle_kernel_cleanup(event, handle) {\n delete PyViz.comms[\"hv-extension-comm\"];\n window.PyViz.plot_index = {}\n}\n\n/**\n * Handle update_display_data messages\n */\nfunction handle_update_output(event, handle) {\n handle_clear_output(event, {cell: {output_area: handle.output_area}})\n handle_add_output(event, handle)\n}\n\nfunction register_renderer(events, OutputArea) {\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n var toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[0]);\n element.append(toinsert);\n return toinsert\n }\n\n events.on('output_added.OutputArea', handle_add_output);\n events.on('output_updated.OutputArea', handle_update_output);\n events.on('clear_output.CodeCell', handle_clear_output);\n events.on('delete.Cell', handle_clear_output);\n events.on('kernel_ready.Kernel', handle_kernel_cleanup);\n\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n safe: true,\n index: 0\n });\n}\n\nif (window.Jupyter !== undefined) {\n try {\n var events = require('base/js/events');\n var OutputArea = require('notebook/js/outputarea').OutputArea;\n 
if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n } catch(err) {\n }\n}\n", + "application/vnd.holoviews_load.v0+json": "" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.holoviews_exec.v0+json": "", + "text/html": [ + "
\n", + "
\n", + "
\n", + "" + ] + }, + "metadata": { + "application/vnd.holoviews_exec.v0+json": { + "id": "2ac0543f-58a1-4b93-9570-a2c0e6f09501" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# imports\n", + "import torch\n", + "import panel as pn\n", + "from delphi.eval import token_selector, vis_pos_map, calc_model_group_stats, visualize_selected_tokens, get_all_tok_metrics_in_label\n", + "from datasets import load_dataset, Dataset\n", + "from transformers import AutoTokenizer\n", + "from typing import cast\n", + "import ipywidgets as widgets\n", + "\n", + "# refer to https://panel.holoviz.org/reference/panes/IPyWidget.html to integrate ipywidgets with panel\n", + "pn.extension('ipywidgets')\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# specify model names (or checkpoints)\n", + "prefix = \"delphi-suite/v0-next-logprobs-llama2-\"\n", + "suffixes = [\n", + " \"100k\",\n", + " \"200k\",\n", + " \"400k\",\n", + "] # , \"800k\", \"1.6m\", \"3.2m\", \"6.4m\", \"12.8m\", \"25.6m\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# load next logprobs data for all models\n", + "split = \"validation[:100]\"\n", + "next_logprobs = {\n", + " suffix: cast(\n", + " Dataset,\n", + " load_dataset(f\"{prefix}{suffix}\", split=split),\n", + " )\n", + " .with_format(\"torch\")\n", + " .map(lambda x: {\"logprobs\": x[\"logprobs\"].to(device)})\n", + " for suffix in suffixes\n", + "}\n", + "next_logprobs_plot = {k: d[\"logprobs\"] for k, d in next_logprobs.items()}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# load the tokenized dataset\n", + "tokenized_corpus_dataset = (\n", + " cast(\n", + " Dataset,\n", + " load_dataset(\"delphi-suite/stories-tokenized\", split=split),\n", + " )\n", + " 
.with_format(\"torch\")\n", + " .map(lambda x: {\"tokens\": x[\"tokens\"].to(device)})\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run this notebook until the following cell, then the rest should work." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jett/Documents/jett/delphi/.venv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "29c01ed2f022418ebc6a7c4f2d8210b4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "BokehModel(combine_events=True, render_bundle={'docs_json': {'f8a47e67-7cc8-4e1f-bc8e-c10325c540a2': {'version…" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# specific token specification\n", + "tokenizer = AutoTokenizer.from_pretrained(\"delphi-suite/stories-tokenizer\")\n", + "\n", + "# Count the frequency of each token using torch.bincount\n", + "token_counts = torch.bincount(tokenized_corpus_dataset[\"tokens\"].view(-1))\n", + "\n", + "# Get the indices that would sort the token counts in descending order\n", + "sorted_indices = torch.argsort(token_counts, descending=True)\n", + "\n", + "# Get the token IDs in descending order of frequency\n", + "valid_tok_ids = sorted_indices.tolist()\n", + "def format_fix(s):\n", + " if s.startswith(\" \"):\n", + " return \"_\" + s[1:]\n", + " return s\n", + "vocab = {format_fix(tokenizer.decode(t, clean_up_tokenization_spaces=True)): t for t in sorted_indices.tolist() if token_counts[t] > 0}\n", + "\n", + "\n", + "selector, selected_ids = token_selector(vocab) 
# use selected_ids as a dynamic variable\n", + "pn.Row(selector, height=500).servable()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected IDs: [40, 2, 14]\n" + ] + } + ], + "source": [ + "if not selected_ids:\n", + " selected_ids = [40, 2, 14]\n", + "print(\"Selected IDs:\", selected_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 512])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(next_logprobs_plot.values())[0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing model 100k\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing model 200k\n", + "Processing model 400k\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2084f5f7ca5e4aeca85b0f5c821eaced", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FigureWidget({\n", + " 'data': [{'line': {'width': 0},\n", + " 'marker': {'color': 'rgba(68, 68, 68, 0.3)'},\n", + " 'mode': 'lines',\n", + " 'name': 'Upper Bound',\n", + " 'showlegend': False,\n", + " 'type': 'scatter',\n", + " 'uid': '3348f11d-9719-4274-9954-43a9dc8f2ce1',\n", + " 'x': [100k, 200k, 400k],\n", + " 'y': array([4.6017912 , 4.03893679, 3.46496367])},\n", + " {'fill': 'tonexty',\n", + " 'fillcolor': 'rgba(68, 68, 68, 0.3)',\n", + " 'line': {'width': 0},\n", + " 'marker': {'color': 'rgba(68, 68, 68, 0.3)'},\n", + " 'mode': 'lines',\n", + " 'name': 'Lower Bound',\n", + " 'showlegend': False,\n", + " 'type': 'scatter',\n", + " 'uid': '2a90892b-69a9-49d6-bad0-086ee4837fcc',\n", + " 'x': [100k, 200k, 400k],\n", + " 'y': array([1.00667199, 0.88813308, 
0.735852 ])},\n", + " {'marker': {'color': 'rgb(31, 119, 180)', 'line': {'color': 'rgb(31, 119, 180)', 'width': 1}, 'size': 0},\n", + " 'mode': 'lines',\n", + " 'name': 'Means',\n", + " 'type': 'scatter',\n", + " 'uid': '24ce8dc7-90cf-48f7-9722-de6cc02ba5a1',\n", + " 'x': [100k, 200k, 400k],\n", + " 'y': array([1.39094847, 1.15670866, 0.93012363])}],\n", + " 'layout': {'template': '...'}\n", + "})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_group_stats = calc_model_group_stats( # i'm not sure if tokenized_corpus_dataset.tolist() is the right input, it was list(tokenized_corpus_dataset) before\n", + " tokenized_corpus_dataset, next_logprobs_plot, selected_ids\n", + ")\n", + "performance_data = {}\n", + "for suffix in suffixes:\n", + " stats = model_group_stats[suffix]\n", + " performance_data[suffix] = (\n", + " -stats[\"median\"],\n", + " -stats[\"75th\"],\n", + " -stats[\"25th\"],\n", + " )\n", + "\n", + "visualize_selected_tokens(performance_data, log_scale=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3dee1d95570945f3a119c830cdd6d9b6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(FloatRangeSlider(value=(0.25, 0.75), description='Quantiles', max=1.0, step=0.05), Dropd…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def show_pos_map(\n", + " quantile: tuple[float, float],\n", + " model_name_1: str,\n", + " model_name_2: str,\n", + "):\n", + " logprobs_diff = next_logprobs[model_name_2][\"logprobs\"] - next_logprobs[model_name_1][\"logprobs\"] # type: ignore\n", + " pos_to_diff = get_all_tok_metrics_in_label(tokenized_corpus_dataset[\"tokens\"], 
selected_tokens=selected_ids, metrics=logprobs_diff, q_start=quantile[0], q_end=quantile[1]) # type: ignore\n", + " try:\n", + " _ = vis_pos_map(list(pos_to_diff.keys()), selected_ids, logprobs_diff, tokenized_corpus_dataset[\"tokens\"], tokenizer) # type: ignore\n", + " except ValueError:\n", + " if pos_to_diff == {}:\n", + " print(\"No tokens found in this label\")\n", + " return\n", + "\n", + "\n", + "widgets.interact_manual(\n", + " show_pos_map,\n", + " quantile=widgets.FloatRangeSlider(\n", + " min=0.0, max=1.0, step=0.05, description=\"Quantiles\"\n", + " ),\n", + " samples=widgets.IntSlider(min=1, max=5, description=\"Samples\", value=2),\n", + " model_name_1=widgets.Dropdown(\n", + " options=suffixes,\n", + " description=\"Model 1\",\n", + " value=\"100k\",\n", + " ),\n", + " model_name_2=widgets.Dropdown(\n", + " options=suffixes,\n", + " description=\"Model 2\",\n", + " value=\"200k\",\n", + " ),\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/model_diff.ipynb b/notebooks/model_diff.ipynb deleted file mode 100644 index c3ea4429..00000000 --- a/notebooks/model_diff.ipynb +++ /dev/null @@ -1,171 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import pickle\n", - "\n", - "\n", - "from datasets import load_dataset, Dataset\n", - "\n", - "\n", - "from typing import cast\n", - "from ipywidgets import interact\n", - "import ipywidgets as widgets\n", - "\n", - "\n", - "from transformers import AutoTokenizer\n", - "from delphi.constants import 
STATIC_ASSETS_DIR\n", - "from delphi.eval.token_positions import get_all_tok_metrics_in_label\n", - "from delphi.eval.vis import vis_pos_map\n", - "from delphi.eval.constants import LLAMA2_NEXT_LOGPROBS_DATASETS_MAP\n", - "\n", - "# from delphi.train.utils import get_device\n", - "\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer = AutoTokenizer.from_pretrained(\"delphi-suite/stories-tokenizer\")\n", - "token_ids = (\n", - " cast(\n", - " Dataset,\n", - " load_dataset(\n", - " \"delphi-suite/v0-tinystories-v2-clean-tokenized\", split=\"validation\"\n", - " ),\n", - " )\n", - " .with_format(\"torch\")\n", - " .map(lambda x: {\"tokens\": x[\"tokens\"].to(device)})\n", - ")\n", - "\n", - "next_logprobs = { # preloading all the logprobs datasets for interactive use\n", - " model_name: (\n", - " cast(\n", - " Dataset,\n", - " load_dataset(f\"{dataset_name}\", split=\"validation\"),\n", - " )\n", - " .with_format(\"torch\")\n", - " .map(lambda x: {\"logprobs\": x[\"logprobs\"].to(device)})\n", - " )\n", - " for model_name, dataset_name in LLAMA2_NEXT_LOGPROBS_DATASETS_MAP.items()\n", - "}\n", - "\n", - "token_labels_filename = \"labelled_token_ids_dict.pkl\"\n", - "with open(f\"{STATIC_ASSETS_DIR.joinpath(token_labels_filename)}\", \"rb\") as f:\n", - " token_labels = pickle.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8e6f7079bf3b43bcb4b1afb904b36d11", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "interactive(children=(FloatRangeSlider(value=(0.25, 0.75), description='Start quantile', max=1.0, step=0.01), …" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "def show_pos_map(\n", - " quantile: tuple[float, float],\n", - " model_name_1: str,\n", - " model_name_2: str,\n", - " label: str,\n", - " samples: int,\n", - "):\n", - " token_id_t = token_ids[\"tokens\"]\n", - " logprobs_diff = next_logprobs[model_name_2][\"logprobs\"] - next_logprobs[model_name_1][\"logprobs\"] # type: ignore\n", - " pos_to_diff = get_all_tok_metrics_in_label(token_id_t, token_labels=token_labels, metrics=logprobs_diff, label=label, q_start=quantile[0], q_end=quantile[1]) # type: ignore\n", - " try:\n", - " _ = vis_pos_map(pos_to_diff, token_id_t, tokenizer, sample=samples) # type: ignore\n", - " except ValueError:\n", - " print(\"No tokens found in this label\")\n", - " return\n", - "\n", - "\n", - "interact(\n", - " show_pos_map,\n", - " quantile=widgets.FloatRangeSlider(\n", - " min=0.0, max=1.0, step=0.01, description=\"Start quantile\"\n", - " ),\n", - " samples=widgets.IntSlider(min=1, max=5, description=\"Samples\", value=2),\n", - " model_name_1=widgets.Dropdown(\n", - " options=LLAMA2_NEXT_LOGPROBS_DATASETS_MAP.keys(),\n", - " description=\"Model 1\",\n", - " value=\"llama2-100k\",\n", - " ),\n", - " model_name_2=widgets.Dropdown(\n", - " options=LLAMA2_NEXT_LOGPROBS_DATASETS_MAP.keys(),\n", - " description=\"Model 2\",\n", - " value=\"llama2-200k\",\n", - " ),\n", - " label=widgets.Dropdown(\n", - " options=token_labels[0].keys(), description=\"Label\", value=\"Is Noun\"\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": 
"python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/per_token_plot.ipynb b/notebooks/per_token_plot.ipynb deleted file mode 100644 index 12e09926..00000000 --- a/notebooks/per_token_plot.ipynb +++ /dev/null @@ -1,165 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fbda6a916fe84814be64a40423196d76", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FigureWidget({\n", - " 'data': [{'line': {'width': 0},\n", - " 'marker': {'color': 'rgba(68, 68, 68, 0.3)'},\n", - " 'mode': 'lines',\n", - " 'name': 'Upper Bound',\n", - " 'showlegend': False,\n", - " 'type': 'scatter',\n", - " 'uid': 'a3590fcd-466d-4a73-b167-194ab728efcd',\n", - " 'x': [0, 1, 2, ..., 497, 498, 499],\n", - " 'y': array([2.34006592, 2.41241021, 2.57781922, ..., 2.56474203, 2.59573629,\n", - " 2.43304471])},\n", - " {'fill': 'tonexty',\n", - " 'fillcolor': 'rgba(68, 68, 68, 0.3)',\n", - " 'line': {'width': 0},\n", - " 'marker': {'color': 'rgba(68, 68, 68, 0.3)'},\n", - " 'mode': 'lines',\n", - " 'name': 'Lower Bound',\n", - " 'showlegend': False,\n", - " 'type': 'scatter',\n", - " 'uid': 'fda82808-c8ff-4b6c-878d-c76d66c8ce17',\n", - " 'x': [0, 1, 2, ..., 497, 498, 499],\n", - " 'y': array([0.93626447, 0.9302987 , 0.99836227, ..., 0.95607835, 0.76146911,\n", - " 0.81709211])},\n", - " {'marker': {'color': 'rgb(31, 119, 180)', 'line': {'color': 'rgb(31, 119, 180)', 'width': 1}, 'size': 0},\n", - " 'mode': 'lines',\n", - " 'name': 'Means',\n", - " 'type': 'scatter',\n", - " 'uid': 'b11dfbd0-c130-4a97-a8ff-b8c753b95035',\n", - " 'x': [0, 1, 2, ..., 497, 498, 499],\n", - " 'y': array([1.3701917 , 1.4372206 , 1.53251235, ..., 1.55583357, 1.50179179,\n", - " 1.45715223])}],\n", - " 'layout': {'template': '...'}\n", - "})" - ] - }, - "execution_count": 1, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "from collections import defaultdict\n", - "import math\n", - "import random\n", - "import numpy as np\n", - "\n", - "from delphi.eval.vis_per_token_model import visualize_per_token_category\n", - "\n", - "\n", - "random.seed(0)\n", - "\n", - "# generate mock data\n", - "# model_names = ['llama2-100k', 'llama2-200k', 'llama2-1m', 'llama2-10m', \"0\"]\n", - "model_names = list(range(500))\n", - "categories = ['nouns', 'verbs', 'prepositions', 'adjectives']\n", - "entries = [200, 100, 150, 300, 100]*100\n", - "performance_data = defaultdict()\n", - "for i, model in enumerate(model_names):\n", - " performance_data[model] = defaultdict()\n", - " for cat in categories:\n", - " x = [math.log2(random.random()) for _ in range(entries[i])]\n", - " means = np.mean(x)\n", - " err_low = means - np.percentile(x, 25)\n", - " err_hi = np.percentile(x, 75) - means\n", - " performance_data[model][cat] = (-means, err_low, err_hi)\n", - "\n", - "\n", - "visualize_per_token_category(performance_data, log_scale=True, checkpoint_mode=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "993e5d66ae56462a8eeec2c9ac6bd972", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FigureWidget({\n", - " 'data': [{'line': {'width': 0},\n", - " 'marker': {'color': 'wheat'},\n", - " 'mode': 'lines',\n", - " 'name': 'Upper Bound',\n", - " 'showlegend': False,\n", - " 'type': 'scatter',\n", - " 'uid': '56999008-205c-4592-a3f7-ea61e3e09d8e',\n", - " 'x': [0, 1, 2, ..., 497, 498, 499],\n", - " 'y': array([2.34006592, 2.41241021, 2.57781922, ..., 2.56474203, 2.59573629,\n", - " 2.43304471])},\n", - " {'fill': 'tonexty',\n", - " 'fillcolor': 'wheat',\n", - " 'line': {'width': 0},\n", - " 'marker': {'color': 'wheat'},\n", - " 'mode': 'lines',\n", - " 'name': 'Lower Bound',\n", - " 'showlegend': False,\n", - " 
'type': 'scatter',\n", - " 'uid': 'be8a04f1-b8c4-46af-bf5e-03c942eff19f',\n", - " 'x': [0, 1, 2, ..., 497, 498, 499],\n", - " 'y': array([0.93626447, 0.9302987 , 0.99836227, ..., 0.95607835, 0.76146911,\n", - " 0.81709211])},\n", - " {'marker': {'color': 'Orange', 'line': {'color': 'Orange', 'width': 1}, 'size': 0},\n", - " 'mode': 'lines',\n", - " 'name': 'Median',\n", - " 'type': 'scatter',\n", - " 'uid': '85fe5113-70fb-4aa7-9821-947287d84e1d',\n", - " 'x': [0, 1, 2, ..., 497, 498, 499],\n", - " 'y': array([1.3701917 , 1.4372206 , 1.53251235, ..., 1.55583357, 1.50179179,\n", - " 1.45715223])}],\n", - " 'layout': {'template': '...'}\n", - "})" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "visualize_per_token_category(performance_data, log_scale=True, checkpoint_mode=True, line_metric=\"Median\", line_color='Orange' , shade_color=\"wheat\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb deleted file mode 100644 index a0c8cf48..00000000 --- a/notebooks/token_labelling.ipynb +++ /dev/null @@ -1,760 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Giving tokens a label - How to categorize tokens\n", - "\n", - "\n", - "The first part of this Notebook contains elements that explain how to label tokens and how the functions work.\n", - "\n", - "The second part shows how all tokens are labelled that are used for our delphi language models.3\n" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": 
{}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "# autoreload\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "from pprint import pprint \n", - "\n", - "import spacy\n", - "from tqdm.auto import tqdm\n", - "\n", - "import delphi\n", - "\n", - "from delphi.eval import token_labelling" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# 1) How to use the token labelling functions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We analyze a simple sentence and receive the respective tokens with their analyzed attributes. \n", - "The grammatical/linguistic analysis is done by a model provided by spaCy for the English language." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Peter \t PROPN \t nsubj \t PERSON\n", - "is \t AUX \t ROOT \t \n", - "a \t DET \t det \t \n", - "person \t NOUN \t attr \t \n" - ] - } - ], - "source": [ - "# Load the english model\n", - "nlp = spacy.load(\"en_core_web_sm\")\n", - "\n", - "# Create a Doc object from a given text\n", - "doc = nlp(\"Peter is a person\")\n", - "\n", - "token = doc[0]\n", - "for tok in doc:\n", - " print(tok,\"\\t\", tok.pos_, \"\\t\", tok.dep_, \"\\t\", tok.ent_type_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's get the label for our custom token that we just printed." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'Capitalized': True,\n", - " 'Is Adjective': False,\n", - " 'Is Adposition': False,\n", - " 'Is Adverb': False,\n", - " 'Is Auxiliary': False,\n", - " 'Is Coordinating conjuction': False,\n", - " 'Is Determiner': False,\n", - " 'Is Interjunction': False,\n", - " 'Is Named Entity': True,\n", - " 'Is Noun': False,\n", - " 'Is Numeral': False,\n", - " 'Is Other': False,\n", - " 'Is Particle': False,\n", - " 'Is Pronoun': False,\n", - " 'Is Proper Noun': True,\n", - " 'Is Punctuation': False,\n", - " 'Is Subordinating conjuction': False,\n", - " 'Is Symbol': False,\n", - " 'Is Verb': False,\n", - " 'Starts with space': False}\n" - ] - } - ], - "source": [ - "from delphi.eval import token_labelling\n", - "\n", - "label = token_labelling.label_single_token(token)\n", - "pprint(label)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's get an understanding of what the labels acutally mean.\n", - "Use this function to receive an explanation for a single token." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-------- Explanation of token labels --------\n", - "Token text: Peter\n", - "Token dependency: nominal subject\n", - "Token POS: proper noun\n", - "---------------- Token labels ---------------\n", - " 0 Starts with space False\n", - " 1 Capitalized True\n", - " 2 Is Adjective False\n", - " 3 Is Adposition False\n", - " 4 Is Adverb False\n", - " 5 Is Auxiliary False\n", - " 6 Is Coordinating conjuction False\n", - " 7 Is Determiner False\n", - " 8 Is Interjunction False\n", - " 9 Is Noun False\n", - " 10 Is Numeral False\n", - " 11 Is Particle False\n", - " 12 Is Pronoun False\n", - " 13 Is Proper Noun True\n", - " 14 Is Punctuation False\n", - " 15 Is Subordinating conjuction False\n", - " 16 Is Symbol False\n", - " 17 Is Verb False\n", - " 18 Is Other False\n", - " 19 Is Named Entity True\n" - ] - } - ], - "source": [ - "token_labelling.explain_token_labels(token)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument:\n", - "```Python\n", - ">>> token_labelling.explain_token_labels()\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Batched token labelling\n", - "Next, let us analyze a batch of sentences and have them labelled.\n", - "> In the example below the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Token: Peter\n", - "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", - "False | True | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | True \n", - "---\n", - "Token: is\n", - "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", - "False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: a\n", - "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", - "False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: person\n", - "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating 
conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: .\n", - "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False \n", - "---\n", - "\n", - "\n", - "5\n", - "[{'Starts with space': False, 'Capitalized': True, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': True, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': True}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': True, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': True, 'Is Interjunction': False, 'Is Noun': False, 
'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': True, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': True, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}]\n" - ] - } - ], - "source": [ - "sentences = [\n", - " \"Peter is a person.\"\n", - "]\n", - "labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=True)\n", - "\n", - "print(len(labels[0]))\n", - "print(labels[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now with our own tokenization. E.g. the one from our TinyStories models." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5\n", - "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': True, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': True, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" - ] - } - ], - "source": [ - "sentences = [\n", - " [\"This \", \"is \", \"a \", \"sentence\", \".\"]\n", - "]\n", - "labelled_sentences = token_labelling.label_batch_sentences(sentences, tokenized=True, verbose=False)\n", - "\n", - "print(len(labelled_sentences[0]))\n", - "print(labelled_sentences[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2) Labelling all tokens in the dataset\n", - "\n", - "Now we want to label all the tokens that our 
tokenizer knows - its entire vocabulary.\n", - "\n", - "Using thy script in `scripts/label_all_tokens.py` we get the files:\n", - "- `src\\delphi\\eval\\all_tokens_list.txt`\n", - "- `src\\delphi\\eval\\labelled_token_ids_dict.pkl`\n", - "\n", - "Let's load the tokenizer so that we can look at the labelled tokens.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\joshu\\anaconda3\\envs\\delphi2\\lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", - " _torch_pytree._register_pytree_node(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The vocab size is: 4096\n" - ] - } - ], - "source": [ - "# Get all the tokens of the tokenizer\n", - "from transformers import AutoTokenizer, PreTrainedTokenizer\n", - "\n", - "\n", - "# Decode a sentence\n", - "def decode(tokenizer: PreTrainedTokenizer, token_ids: list[int]) -> str:\n", - " return tokenizer.decode(token_ids, skip_special_tokens=True)\n", - "\n", - "model = \"delphi-suite/delphi-llama2-100k\"\n", - "tokenizer = AutoTokenizer.from_pretrained(model)\n", - "vocab_size = tokenizer.vocab_size\n", - "print(\"The vocab size is:\", vocab_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the pickle." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "path = \"../src/delphi/eval/labelled_token_ids_dict.pkl\"\n", - "# load \n", - "with open(path, \"rb\") as f:\n", - " labelled_token_ids_dict = pickle.load(f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at some random tokens and their labels" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The token id is: 2980\n", - "The decoded token is: four\n", - "The label is:\n", - "{'Capitalized': False,\n", - " 'Is Adjective': False,\n", - " 'Is Adposition': False,\n", - " 'Is Adverb': False,\n", - " 'Is Auxiliary': False,\n", - " 'Is Coordinating conjuction': False,\n", - " 'Is Determiner': False,\n", - " 'Is Interjunction': False,\n", - " 'Is Named Entity': False,\n", - " 'Is Noun': True,\n", - " 'Is Numeral': False,\n", - " 'Is Other': False,\n", - " 'Is Particle': False,\n", - " 'Is Pronoun': False,\n", - " 'Is Proper Noun': False,\n", - " 'Is Punctuation': False,\n", - " 'Is Subordinating conjuction': False,\n", - " 'Is Symbol': False,\n", - " 'Is Verb': False,\n", - " 'Starts with space': False}\n" - ] - } - ], - "source": [ - "import random\n", - "from pprint import pprint\n", - "# Get a random token id between 0 and 4000\n", - "token_id = random.randint(0, 4000)\n", - "# decode the token id\n", - "decoded_token = decode(tokenizer, [token_id])\n", - "# get the corresponding label\n", - "label = labelled_token_ids_dict[token_id]\n", - "# print the results\n", - "print(\"The token id is:\", token_id)\n", - "print(\"The decoded token is:\", decoded_token)\n", - "print(\"The label is:\")\n", - "pprint(label)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3) Visualize the token label stats\n", - "\n", - "Let's have a look at the statistics." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt # install matplotlib, if necessary\n", - "from tqdm.autonotebook import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
token_idStarts with spaceCapitalizedIs AdjectiveIs AdpositionIs AdverbIs AuxiliaryIs Coordinating conjuctionIs DeterminerIs Interjunction...Is NumeralIs ParticleIs PronounIs Proper NounIs PunctuationIs Subordinating conjuctionIs SymbolIs VerbIs OtherIs Named Entity
00000000000...0000000010
11000000000...0000000010
22000000000...0000000010
33000000000...0000100000
44000000000...0000100000
\n", - "

5 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " token_id Starts with space Capitalized Is Adjective Is Adposition \\\n", - "0 0 0 0 0 0 \n", - "1 1 0 0 0 0 \n", - "2 2 0 0 0 0 \n", - "3 3 0 0 0 0 \n", - "4 4 0 0 0 0 \n", - "\n", - " Is Adverb Is Auxiliary Is Coordinating conjuction Is Determiner \\\n", - "0 0 0 0 0 \n", - "1 0 0 0 0 \n", - "2 0 0 0 0 \n", - "3 0 0 0 0 \n", - "4 0 0 0 0 \n", - "\n", - " Is Interjunction ... Is Numeral Is Particle Is Pronoun Is Proper Noun \\\n", - "0 0 ... 0 0 0 0 \n", - "1 0 ... 0 0 0 0 \n", - "2 0 ... 0 0 0 0 \n", - "3 0 ... 0 0 0 0 \n", - "4 0 ... 0 0 0 0 \n", - "\n", - " Is Punctuation Is Subordinating conjuction Is Symbol Is Verb Is Other \\\n", - "0 0 0 0 0 1 \n", - "1 0 0 0 0 1 \n", - "2 0 0 0 0 1 \n", - "3 1 0 0 0 0 \n", - "4 1 0 0 0 0 \n", - "\n", - " Is Named Entity \n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - "[5 rows x 21 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"\"\"\n", - " Create a pandas dataframe from the labelled token ids dictionary\n", - "\"\"\"\n", - "# each item in the dictionary is a tuple (token_id, label), where label: dict[str, float]\n", - "# the dataframe should have the columns: token_id, label1, label2 ... labelN\n", - "# label1, label2 ... labelN are the keys of the label dictionary\n", - "# the values of the label dictionary are the probabilities of the label\n", - "# here we go:\n", - "df = pd.DataFrame(labelled_token_ids_dict.items(), columns=[\"token_id\", \"label\"])\n", - "# split the label column into multiple columns\n", - "df = df.join(pd.DataFrame(df.pop('label').tolist()))\n", - "# Change datatype of columns to float\n", - "df = df.astype(int)\n", - "\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We perform a **sanity check** to assure that the code above was correct." 
- ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "aac7894b3f61477bb96b9818757be9f4", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "0it [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Perform sanity check, that the table was created correctly\n", - "for (row_index, row_values) in tqdm(df.iterrows()):\n", - " token_id = row_values.iloc[0]\n", - " label_pandas = list(row_values.iloc[1:]) # we exclude the token_id from the colum\n", - " label_dict = list(labelled_token_ids_dict[token_id].values())[:]\n", - " assert label_pandas == label_dict, f\"The dataframes are not equal for row {token_id}\\n{label_pandas}\\n{label_dict}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# visualize the occurences of TRUE per column\n", - "count = df[df==1].count()\n", - "index, values = count.index[1:], count.values[1:]\n", - "\n", - "plt.figure()\n", - "#_ = plt.pie(values, autopct='%1.1f%%')\n", - "plt.bar(index, values)\n", - "# the values are plotted above the bars\n", - "for i, value in enumerate(values):\n", - " plt.text(i, value+50, str(value), ha=\"center\", color=\"grey\")\n", - "plt.ylabel(\"Occurences\")\n", - "# rotate x labels\n", - "_ = plt.xticks(rotation=90)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv_tinyevals", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/training_demo.ipynb b/notebooks/training_demo.ipynb deleted file mode 100644 index 18c7e533..00000000 --- a/notebooks/training_demo.ipynb +++ /dev/null @@ -1,45 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from delphi.train.config.utils import load_preset\n", - "from delphi.train.training import run_training\n", - "from delphi.train.utils import ModelTrainingState\n", - "from delphi.train.run_context import RunContext\n", - "\n", - "\n", - "def train() -> tuple[ModelTrainingState, RunContext]:\n", - " config = load_preset(\"v0-llama2-100k\")\n", - " config.wandb_config.entity = \"jaiwithani\"\n", - " return run_training(config)\n", - "\n", - "model_train_result = train()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "tinyevals", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": 
"ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/vis_demo.ipynb b/notebooks/vis_demo.ipynb deleted file mode 100644 index 842804d0..00000000 --- a/notebooks/vis_demo.ipynb +++ /dev/null @@ -1,148 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import torch; torch.set_grad_enabled(False)\n", - "from transformers import AutoTokenizer, AutoModelForCausalLM\n", - "\n", - "from delphi.eval.utils import tokenize, get_next_and_top_k_probs, load_validation_dataset\n", - "from delphi.eval.vis import vis_sample_prediction_probs\n", - "\n", - "model_name = \"roneneldan/TinyStories-1M\"\n", - "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "model = AutoModelForCausalLM.from_pretrained(model_name)\n", - "ds = load_validation_dataset(\"tinystories-v2-clean\")\n", - "ds_txt = ds[\"story\"][:100]\n", - "ds_tok = [tokenize(tokenizer, txt) for txt in ds_txt]\n", - "sample_tok = ds_tok[0]\n", - "\n", - "correct_probs, top_3_probs = get_next_and_top_k_probs(model, sample_tok, k=3)\n", - "_, top_5_probs = get_next_and_top_k_probs(model, sample_tok, k=5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### collect top k predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "
<|endoftext|>
Once
 upon
 a
 time
,
 there
 was
 a
 kind
 girl
 named
 Lily
.
 Lily
 loved
 to
 mix
 things
.
 One
 day
,
 she
 found
 a
 big
 box
 full
 of
 colors
.
 Lily
 was
 very
 happy
.
\\n

L
ily
 took
 out
 a
 strip
 of
 red
 and
 a
 strip
 of
 blue
.
 She
 mixed
 them
 together
 and
 made
 a
 new
 color
,
 purple
!
 Lily
 was
 so
 excited
.
 She
 wanted
 to
 mix
 more
 colors
.
\\n

Next
,
 Lily
 took
 a
 strip
 of
 yellow
 and
 a
 strip
 of
 green
.
 She
 mixed
 them
 together
 and
 made
 a
 new
 color
,
 orange
!
 Lily
 was
 very
 proud
 of
 herself
.
 She
 showed
 her
 new
 colors
 to
 her
 mom
 and
 dad
,
 and
 they
 were
 proud
 of
 her
 too
.
 They
 all
 lived
 happily
 ever
 after
.
\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "_ = vis_sample_prediction_probs(sample_tok, correct_probs, top_3_probs, tokenizer)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "
<|endoftext|>
Once
 upon
 a
 time
,
 there
 was
 a
 kind
 girl
 named
 Lily
.
 Lily
 loved
 to
 mix
 things
.
 One
 day
,
 she
 found
 a
 big
 box
 full
 of
 colors
.
 Lily
 was
 very
 happy
.
\\n

L
ily
 took
 out
 a
 strip
 of
 red
 and
 a
 strip
 of
 blue
.
 She
 mixed
 them
 together
 and
 made
 a
 new
 color
,
 purple
!
 Lily
 was
 so
 excited
.
 She
 wanted
 to
 mix
 more
 colors
.
\\n

Next
,
 Lily
 took
 a
 strip
 of
 yellow
 and
 a
 strip
 of
 green
.
 She
 mixed
 them
 together
 and
 made
 a
 new
 color
,
 orange
!
 Lily
 was
 very
 proud
 of
 herself
.
 She
 showed
 her
 new
 colors
 to
 her
 mom
 and
 dad
,
 and
 they
 were
 proud
 of
 her
 too
.
 They
 all
 lived
 happily
 ever
 after
.
\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "_ = vis_sample_prediction_probs(sample_tok, correct_probs, top_5_probs, tokenizer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pyproject.toml b/pyproject.toml index c7742381..dec159e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,25 +5,14 @@ dependencies = [ "torch==2.1.2", "datasets==2.16.1", "tqdm==4.66.1", - "ipywidgets==8.1.1", - "nbformat==5.9.2", - "pytest==7.4.4", - "black==23.12.1", "jaxtyping==0.2.25", "beartype==0.18.2", - "pre-commit==3.6.0", - "isort==5.13.2", "chardet==5.2.0", - "sentencepiece==0.1.99", - "protobuf==4.25.2", "plotly==5.18.0", "wandb==0.16.3", - "spacy==3.7.2", - "pandas==1.3.4", "dacite==1.8.1", - "panel==1.4.0", - "jupyter_bokeh==4.0.1", "transformers==4.40.0", + "platformdirs==4.2.2" ] [project.optional-dependencies] @@ -31,6 +20,19 @@ mamba_cuda = [ "mamba_ssm==1.2.0.post1", "causal-conv1d==1.2.0.post2", ] +notebooks = [ + "ipykernel==6.29.4", + "panel==1.4.0", + "jupyter_bokeh==4.0.1", + "ipywidgets==8.1.1", + "nbformat==5.9.2", +] +dev = [ + "pytest==7.4.4", + "black==23.12.1", + "isort==5.13.2", + "pre-commit==3.6.0", +] [build-system] requires = ["setuptools", "wheel"] diff --git a/requirements-nocuda.txt b/requirements-nocuda.txt deleted file mode 100644 index d3052815..00000000 --- a/requirements-nocuda.txt +++ /dev/null @@ -1,2 +0,0 @@ -# this references packages specified by pyproject.toml -. 
diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 10c94e31..00000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -# this references dependencies in pyproject.toml (including optional cuda dependencies) -. -.[mamba_cuda] \ No newline at end of file diff --git a/scripts/get_next_logprobs.py b/scripts/get_next_logprobs.py index 5cf0d26e..d44b7015 100755 --- a/scripts/get_next_logprobs.py +++ b/scripts/get_next_logprobs.py @@ -9,7 +9,6 @@ from transformers import AutoModelForCausalLM from delphi import utils -from delphi.eval.utils import get_all_and_next_logprobs torch.set_grad_enabled(False) @@ -61,7 +60,7 @@ def get_logprobs_single_model( for i in trange(0, n_seq, batch_size): batch_tokens = dataset[i : i + batch_size][feature] logprobs[i : i + batch_size, 1:] = ( - get_all_and_next_logprobs(model, batch_tokens)[1].cpu().numpy() # type: ignore + utils.get_all_and_next_logprobs(model, batch_tokens)[1].cpu().numpy() # type: ignore ) return Dataset.from_dict({"logprobs": [row for row in logprobs]}) diff --git a/scripts/map_tokens.py b/scripts/map_tokens.py deleted file mode 100755 index b0393fa5..00000000 --- a/scripts/map_tokens.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 - -import argparse - -import pandas as pd -from datasets import Dataset - -from delphi.eval.token_map import token_map -from delphi.eval.utils import load_validation_dataset - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="") - - parser.add_argument( - "dataset_name", - type=str, - help="Dataset from huggingface to run token_map on", - ) - parser.add_argument( - "--username", - type=str, - help="Hugging Face API username", - ) - parser.add_argument( - "--token", - type=str, - help="Hugging Face API token", - ) - parser.add_argument( - "--tokenizer-size", - type=int, - default=4096, - help="Size of the tokenizer", - ) - args = parser.parse_args() - - dataset = load_validation_dataset(args.dataset_name) - - hf_dataset = 
Dataset.from_dict( - {"prompt_pos_idx": token_map(dataset, args.tokenizer_size)} - ) - - repo_id = f"{args.username}/v0-token-map" # location in to hf - - hf_dataset.push_to_hub( - repo_id=repo_id, - split="validation", - private=False, - token=args.token, - ) diff --git a/scripts/spacy_label_all_tokens.py b/scripts/spacy_label_all_tokens.py deleted file mode 100644 index 3ff8ccda..00000000 --- a/scripts/spacy_label_all_tokens.py +++ /dev/null @@ -1,106 +0,0 @@ -import argparse -import pickle -from pathlib import Path - -import pandas as pd -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast - -from delphi.eval import spacy_token_labelling - - -def tokenize( - tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, sample_txt: str -) -> int: - # supposedly this can be different than prepending the bos token id - return tokenizer.encode(tokenizer.bos_token + sample_txt, return_tensors="pt")[0] - - -# Decode a sentence -def decode( - tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, token_ids: int | list[int] -) -> str: - return tokenizer.decode(token_ids, skip_special_tokens=True) - - -def main(): - # Setup argparse - parser = argparse.ArgumentParser(description="Tokenization and labeling utility.") - parser.add_argument( - "--model-name", - type=str, - help="Name of the model to use for tokenization and labeling.", - default="delphi-suite/delphi-llama2-100k", - required=False, - ) - parser.add_argument( - "--save-dir", type=str, help="Directory to save the results.", required=True - ) - args = parser.parse_args() - - # Access command-line arguments - # Directory to save the results - save_dir = Path(args.save_dir) - save_dir.mkdir(parents=True, exist_ok=True) # create directory if it does not exist - model_name = args.model_name - - print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n") - print(f"You chose the model: {model_name}\n") - print( - f"The language model will be loaded from 
Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{save_dir}'\n" - ) - - # ================ (1) ================= - print("(1) Create a list of all tokens in the tokenizer's vocabulary ...") - - # Load the tokenizer from Huggingface - tokenizer = AutoTokenizer.from_pretrained(model_name) - print("Loaded the tokenizer.\nThe vocab size is:", tokenizer.vocab_size) - - ( - tokens_str, - labelled_token_ids_dict, - ) = spacy_token_labelling.label_tokens_from_tokenizer(tokenizer) - - # Save the list of all tokens to a file - filename = "all_tokens_list.txt" - filepath = save_dir / filename # TODO: use the static files of python module - with open(filepath, "w", encoding="utf-8") as f: - f.write(tokens_str) - - print(f"Saved the list of all tokens to:\n\t{filepath}\n") - - # ================ (2) ================= - print("(2) Label each token ...") - - print("\nCreating the CSV ...") - - df = spacy_token_labelling.convert_label_dict_to_df(labelled_token_ids_dict) - - print("Sanity check pandas csv ...", end="") - # Perform sanity check, that the table was created correctly - for row_index, row_values in df.iterrows(): - token_id = row_values.iloc[0] - label_pandas = list( - row_values.iloc[1:] - ) # we exclude the token_id from the colum - label_dict = list(labelled_token_ids_dict[token_id].values())[:] - assert ( - label_pandas == label_dict - ), f"The dataframes are not equal for row {token_id}\n{label_pandas}\n{label_dict}" - print(" completed.") - - # TODO: Fix the issue with disappearing spaces when exporting DataFrame to CSV. - # There's a known problem where no token is classified as "starting with a space". 
- - # save the dataframe to a csv - filename = "spacy_labelled_token_ids.csv" - filepath = save_dir / filename - df.to_csv(filepath, index=False) - print(f"Saved the labelled tokens as CSV to:\n\t{filepath}\n") - - print(" END ".center(50, "=")) - - -if __name__ == "__main__": - main() diff --git a/scripts/tokenize_dataset.py b/scripts/tokenize_dataset.py index a74df312..627a5951 100755 --- a/scripts/tokenize_dataset.py +++ b/scripts/tokenize_dataset.py @@ -4,28 +4,32 @@ import os from pathlib import Path -from datasets import Dataset, Features, Value, load_dataset +from datasets import Dataset from huggingface_hub import HfApi from transformers import AutoTokenizer -from delphi.dataset.tokenization import get_tokenized_chunks +from delphi import utils +from delphi.tokenization import get_tokenized_chunks if __name__ == "__main__": - parser = argparse.ArgumentParser(description="", allow_abbrev=False) + parser = argparse.ArgumentParser( + description="Tokenize a text dataset using a specified tokenizer", + allow_abbrev=False, + ) parser.add_argument( - "--in-repo-id", + "--in-dataset", "-i", type=str, required=True, - help="Text dataset from huggingface to tokenize", + help="Dataset you want to tokenize. 
Local path or HF repo id", ) parser.add_argument( "--feature", "-f", type=str, required=True, - help="Name of the column containing text documents in the input dataset", + help="Name of the feature (column) containing text documents in the input dataset", ) parser.add_argument( "--split", @@ -34,18 +38,6 @@ required=True, help="Split of the dataset to be tokenized, supports slicing like 'train[:10%%]'", ) - parser.add_argument( - "--out-dir", - type=str, - required=False, - help="Local directory to save the resulting dataset", - ) - parser.add_argument( - "--out-repo-id", - type=str, - required=False, - help="HF repo id to upload the resulting dataset", - ) parser.add_argument( "--tokenizer", "-t", @@ -58,32 +50,39 @@ "-l", type=int, required=True, - help="Context size of the tokenized dataset as input of the model", + help="Length of the tokenized sequences", ) parser.add_argument( "--batch-size", "-b", type=int, default=50, - help="Size of input into batched tokenization", + help="How many text documents to tokenize at once (default: 50)", ) parser.add_argument( "--chunk-size", "-c", type=int, default=200_000, - help="Size of the parquet chunks uploaded to HuggingFace", + help="Maximum number of tokenized sequences in a single parquet file (default: 200_000)", + ) + parser.add_argument( + "--out-dir", + type=str, + required=False, + help="Local directory to save the resulting dataset", + ) + parser.add_argument( + "--out-repo", + type=str, + required=False, + help="HF repo id to upload the resulting dataset", ) args = parser.parse_args() - assert ( - args.out_repo_id or args.out_dir - ), "You need to provide --out-repo-id or --out-dir" + assert args.out_repo or args.out_dir, "You need to provide --out-repo or --out-dir" - print(f"Loading dataset '{args.in_repo_id}'...") - in_dataset_split = load_dataset( - args.in_repo_id, - split=args.split, - features=Features({args.feature: Value("string")}), + in_dataset_split = utils.load_dataset_split_string_feature( + 
args.in_dataset, args.split, args.feature ) assert isinstance(in_dataset_split, Dataset) print(f"Loading tokenizer from '{args.tokenizer}'...") @@ -92,9 +91,9 @@ assert tokenizer.eos_token_id is not None, "Tokenizer must have a eos_token_id" api = None - if args.out_repo_id: + if args.out_repo: api = HfApi() - api.create_repo(repo_id=args.out_repo_id, repo_type="dataset", exist_ok=True) + api.create_repo(repo_id=args.out_repo, repo_type="dataset", exist_ok=True) if args.out_dir: os.makedirs(args.out_dir, exist_ok=True) @@ -107,7 +106,7 @@ ) print(f"Tokenizing split='{args.split}'...") - split_name = args.split.split("[")[0] + split_name = utils.hf_split_to_split_name(args.split) for chunk_idx, ds_chunk in enumerate(ds_chunks_it): chunk_name = f"{split_name}-{chunk_idx:05}.parquet" if args.out_dir: @@ -117,11 +116,11 @@ ds_parquet_chunk = io.BytesIO() ds_chunk.to_parquet(ds_parquet_chunk) if api: - print(f"Uploading '{chunk_name}' to '{args.out_repo_id}'...") + print(f"Uploading '{chunk_name}' to '{args.out_repo}'...") api.upload_file( path_or_fileobj=ds_parquet_chunk, path_in_repo=f"data/{chunk_name}", - repo_id=args.out_repo_id, + repo_id=args.out_repo, repo_type="dataset", ) print(f"Done saving/uploading '{chunk_name}'") diff --git a/scripts/run_training.py b/scripts/train_model.py similarity index 73% rename from scripts/run_training.py rename to scripts/train_model.py index 98e126ed..ffe0af0c 100755 --- a/scripts/run_training.py +++ b/scripts/train_model.py @@ -3,14 +3,10 @@ import logging import sys from pathlib import Path -from typing import Any -from delphi.train.config import ( - build_config_from_files_and_overrides, - dot_notation_to_dict, -) +from delphi.train.config import build_config_from_files_and_overrides from delphi.train.training import run_training -from delphi.train.utils import save_results +from delphi.train.utils import overrides_to_dict, save_results def add_logging_args(parser: argparse.ArgumentParser): @@ -51,23 +47,21 @@ def 
set_logging(args: argparse.Namespace): def setup_parser() -> argparse.ArgumentParser: # Setup argparse - parser = argparse.ArgumentParser(description="Train a delphi model") + parser = argparse.ArgumentParser( + description="Train a delphi model", allow_abbrev=False + ) parser.add_argument( - "--config_files", - "--config_file", - "-c", + "config_files", help=( - "Path to json file(s) containing config values. Specific values can be overridden with --overrides. " - "e.g. `--config_files primary_config.json secondary_config.json" + "Path to json file(s) containing config values, e.g. 'primary_config.json secondary_config.json'." ), type=str, - required=False, nargs="*", ) parser.add_argument( "--overrides", help=( - "Override config values with comma-separated declarations. " + "Override config values with space-separated declarations. " "e.g. `--overrides model_config.hidden_size=42 run_name=foo`" ), type=str, @@ -79,12 +73,6 @@ def setup_parser() -> argparse.ArgumentParser: return parser -def overrides_to_dict(overrides: list[str]) -> dict[str, Any]: - # ["a.b.c=4", "foo=false"] to {"a": {"b": {"c": 4}}, "foo": False} - config_vars = {k: v for k, v in [x.split("=") for x in overrides if "=" in x]} - return dot_notation_to_dict(config_vars) - - def main(): parser = setup_parser() args = parser.parse_args() diff --git a/scripts/train_tokenizer.py b/scripts/train_tokenizer.py index 83e071ae..a82be7c3 100755 --- a/scripts/train_tokenizer.py +++ b/scripts/train_tokenizer.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 import argparse -from datasets import Dataset, Features, Value, load_dataset +from datasets import Dataset, Features, Value from tokenizers import ByteLevelBPETokenizer # type: ignore from transformers import PreTrainedTokenizerFast +from delphi import utils + def train_byte_level_bpe( dataset: Dataset, feature: str, vocab_size: int @@ -27,14 +29,17 @@ def train_byte_level_bpe( if __name__ == "__main__": - parser = argparse.ArgumentParser(description="", 
allow_abbrev=False) + parser = argparse.ArgumentParser( + description="Train a custom, reversible, BPE tokenizer (GPT2-like). You need to provide --out-repo or --out-dir.", + allow_abbrev=False, + ) parser.add_argument( - "--in-repo-id", + "--in-dataset", "-i", type=str, required=True, - help="Input dataset", + help="Dataset you want to train the tokenizer on. Local path or HF repo id", ) parser.add_argument( "--feature", @@ -64,21 +69,16 @@ def train_byte_level_bpe( help="Local directory to save the resulting tokenizer", ) parser.add_argument( - "--out-repo-id", + "--out-repo", type=str, required=False, help="HF repo id to upload the resulting tokenizer", ) args = parser.parse_args() - assert ( - args.out_repo_id or args.out_dir - ), "You need to provide out_repo_id or out_dir" + assert args.out_repo or args.out_dir, "You need to provide --out-repo or --out-dir" - print(f"Loading dataset '{args.in_repo_id}'...") - in_dataset_split = load_dataset( - args.in_repo_id, - split=args.split, - features=Features({args.feature: Value("string")}), + in_dataset_split = utils.load_dataset_split_string_feature( + args.in_dataset, args.split, args.feature ) assert isinstance(in_dataset_split, Dataset) tokenizer = train_byte_level_bpe( @@ -90,9 +90,9 @@ def train_byte_level_bpe( print(f"Saving tokenizer to '{args.out_dir}' directory...") tokenizer.save_pretrained(args.out_dir) print("Done.") - if args.out_repo_id: - print(f"Pushing tokenizer to HF repo '{args.out_repo_id}'...") + if args.out_repo: + print(f"Pushing tokenizer to HF repo '{args.out_repo}'...") tokenizer.push_to_hub( - repo_id=args.out_repo_id, + repo_id=args.out_repo, ) print("Done.") diff --git a/scripts/training_run.sh b/scripts/training_run.sh deleted file mode 100644 index 7d1b2fe8..00000000 --- a/scripts/training_run.sh +++ /dev/null @@ -1,6 +0,0 @@ -counter=1 -for config in 4-76.json 6-112 6-204 -do - CUDA_VISIBLE_DEVICES=$counter CUBLAS_WORKSPACE_CONFIG=:4096:8 python scripts/run_training.py --config 
scripts/$config & > $config.log - counter=$((counter+1)) -done diff --git a/scripts/validate_configs.py b/scripts/validate_configs.py index 46d3f5d1..86b6b4f6 100755 --- a/scripts/validate_configs.py +++ b/scripts/validate_configs.py @@ -3,6 +3,7 @@ import pathlib from delphi.train.config import build_config_from_files_and_overrides +from delphi.train.utils import init_model, overrides_to_dict def get_config_path_with_base(config_path: pathlib.Path) -> list[pathlib.Path]: @@ -33,15 +34,32 @@ def main(): type=str, help="path to a training config json or directory of training config jsons", ) + parser.add_argument( + "--overrides", + help=( + "Override config values with space-separated declarations. " + "e.g. `--overrides model_config.hidden_size=42 run_name=foo`" + ), + type=str, + required=False, + nargs="*", + default=[], + ) + parser.add_argument("--init", help="initialize the model", action="store_true") args = parser.parse_args() config_paths = get_config_paths(args.config_path) print( f"validating configs: {' | '.join(str(config_path[-1]) for config_path in config_paths)}" ) + overrides = overrides_to_dict(args.overrides) errors = [] + sizes = [] for config_path in config_paths: try: - build_config_from_files_and_overrides(config_path, {}) + config = build_config_from_files_and_overrides(config_path, overrides) + if args.init: + model = init_model(config.model_config, seed=config.torch_seed) + sizes.append((config_path, model.num_parameters())) except Exception as e: errors.append((config_path, e)) continue @@ -51,6 +69,10 @@ def main(): print(f" {config_path[-1]}: {e}") else: print("all configs loaded successfully") + if sizes: + print("model sizes:") + for config_path, size in sizes: + print(f" {config_path[-1]}: {size}") if __name__ == "__main__": diff --git a/setup.py b/setup.py index 4a92f04d..5dd7948a 100644 --- a/setup.py +++ b/setup.py @@ -2,8 +2,8 @@ setup( name="delphi", - packages=find_packages(where="src"), - package_dir={"": "src"}, + 
packages=find_packages(where="."), + package_dir={"": "."}, package_data={ "delphi": ["test_configs/**/*"], }, diff --git a/src/delphi/__init__.py b/src/delphi/__init__.py deleted file mode 100644 index a0ea3bb4..00000000 --- a/src/delphi/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from beartype.claw import beartype_this_package # <-- hype comes - -beartype_this_package() # <-- hype goes - -__version__ = "0.1.1" diff --git a/src/delphi/dummy.py b/src/delphi/dummy.py deleted file mode 100644 index 028ff372..00000000 --- a/src/delphi/dummy.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch -from jaxtyping import Float, Int - -Type1 = Float[torch.Tensor, "dim"] -Type2 = Int[torch.Tensor, "batch dim"] - - -def dummy(arg: Type1 | Type2) -> Type1: - if isinstance(arg, Type1): - return arg + 1 - elif isinstance(arg, Type2): - return arg[0] - 0.1 diff --git a/src/delphi/eval/calc_model_group_stats.py b/src/delphi/eval/calc_model_group_stats.py deleted file mode 100644 index d9c5d4c1..00000000 --- a/src/delphi/eval/calc_model_group_stats.py +++ /dev/null @@ -1,54 +0,0 @@ -import numpy as np - - -def calc_model_group_stats( - tokenized_corpus_dataset: list, - logprobs_by_dataset: dict[str, list[list[float]]], - token_labels_by_token: dict[int, dict[str, bool]], - token_labels: list[str], -) -> dict[tuple[str, str], dict[str, float]]: - """ - For each (model, token group) pair, calculate useful stats (for visualization) - - args: - - tokenized_corpus_dataset: the tokenized corpus dataset, e.g. load_dataset(constants.tokenized_corpus_dataset))["validation"] - - logprob_datasets: a dict of lists of logprobs, e.g. {"llama2": load_dataset("transcendingvictor/llama2-validation-logprobs")["validation"]["logprobs"]} - - token_groups: a dict of token groups, e.g. {0: {"Is Noun": True, "Is Verb": False, ...}, 1: {...}, ...} - - models: a list of model names, e.g. constants.LLAMA2_MODELS - - token_labels: a list of token group descriptions, e.g. ["Is Noun", "Is Verb", ...] 
- - returns: a dict of (model, token group) pairs to a dict of stats, - e.g. {("llama2", "Is Noun"): {"mean": -0.5, "median": -0.4, "min": -0.1, "max": -0.9, "25th": -0.3, "75th": -0.7}, ...} - - Technically `models` and `token_labels` are redundant, as they are also keys in `logprob_datasets` and `token_groups`, - but it's better to be explicit - - stats calculated: mean, median, min, max, 25th percentile, 75th percentile - """ - model_group_stats = {} - for model in logprobs_by_dataset: - group_logprobs = {} - print(f"Processing model {model}") - dataset = logprobs_by_dataset[model] - for ix_doc_lp, document_lps in enumerate(dataset): - tokens = tokenized_corpus_dataset[ix_doc_lp]["tokens"] - for ix_token, token in enumerate(tokens): - if ix_token == 0: # skip the first token, which isn't predicted - continue - logprob = document_lps[ix_token] - for token_group_desc in token_labels: - if token_labels_by_token[token][token_group_desc]: - if token_group_desc not in group_logprobs: - group_logprobs[token_group_desc] = [] - group_logprobs[token_group_desc].append(logprob) - for token_group_desc in token_labels: - if token_group_desc in group_logprobs: - model_group_stats[(model, token_group_desc)] = { - "mean": np.mean(group_logprobs[token_group_desc]), - "median": np.median(group_logprobs[token_group_desc]), - "min": np.min(group_logprobs[token_group_desc]), - "max": np.max(group_logprobs[token_group_desc]), - "25th": np.percentile(group_logprobs[token_group_desc], 25), - "75th": np.percentile(group_logprobs[token_group_desc], 75), - } - return model_group_stats diff --git a/src/delphi/eval/compare_models.py b/src/delphi/eval/compare_models.py deleted file mode 100644 index e03b300c..00000000 --- a/src/delphi/eval/compare_models.py +++ /dev/null @@ -1,91 +0,0 @@ -from dataclasses import dataclass - -import torch -from jaxtyping import Int -from transformers import PreTrainedModel - -from delphi.eval.utils import get_all_and_next_logprobs_single - - -def 
identify_model(model: PreTrainedModel) -> str: - return model.config.name_or_path - - -@dataclass -class TokenPrediction: - token: int - base_model_prob: float - lift_model_prob: float - - -@dataclass -class NextTokenStats: - base_model: str - lift_model: str - next_prediction: TokenPrediction - topk: list[TokenPrediction] - - -def compare_models( - model_a: PreTrainedModel, - model_b: PreTrainedModel, - sample_tok: Int[torch.Tensor, "seq"], - top_k: int = 3, -) -> list[NextTokenStats | None]: - """ - Compare the probabilities of the next token for two models and get the top k token predictions according to model B. - Args: - - model_a: The first model (assumed to be the base model) - - model_b: The second model (assumed to be the improved model) - - sample_tok: The tokenized prompt - - top_k: The number of top token predictions to retrieve (default is 5) - Returns: - A list of NextTokenStats objects, one for each token in the prompt. - Tensors are aligned to the token they are predicting (by prepending a -1 to the start of the tensor) - """ - assert ( - model_a.device == model_b.device - ), "Both models must be on the same device for comparison." 
- - device = model_a.device - sample_tok = sample_tok.to(device) - - logprobs_a, next_probs_a = get_all_and_next_logprobs_single(model_a, sample_tok) - logprobs_b, next_probs_b = get_all_and_next_logprobs_single(model_b, sample_tok) - - probs_a = torch.exp(logprobs_a) - probs_b = torch.exp(logprobs_b) - - top_k_b = torch.topk(probs_b, top_k, dim=-1) - top_k_a_probs = torch.gather(probs_a, 1, top_k_b.indices) - - top_k_b_tokens = top_k_b.indices - top_k_b_probs = top_k_b.values - - comparisons = [] - # ignore first token when evaluating predictions - comparisons.append(None) - - for next_p_a, next_p_b, top_toks_b, top_probs_a, top_probs_b in zip( - next_probs_a, next_probs_b, top_k_b_tokens, top_k_a_probs, top_k_b_probs - ): - nts = NextTokenStats( - base_model=identify_model(model_a), - lift_model=identify_model(model_b), - next_prediction=TokenPrediction( - token=int(next_p_a.item()), - base_model_prob=next_p_a.item(), - lift_model_prob=next_p_b.item(), - ), - topk=[ - TokenPrediction( - token=int(top_toks_b[i].item()), - base_model_prob=top_probs_a[i].item(), - lift_model_prob=top_probs_b[i].item(), - ) - for i in range(top_k) - ], - ) - comparisons.append(nts) - - return comparisons diff --git a/src/delphi/eval/constants.py b/src/delphi/eval/constants.py deleted file mode 100644 index 5cd3daf1..00000000 --- a/src/delphi/eval/constants.py +++ /dev/null @@ -1,26 +0,0 @@ -corpus_dataset = "delphi-suite/tinystories-v2-clean" -tokenized_corpus_dataset = "delphi-suite/tinystories-v2-clean-tokenized-v0" - -LLAMA2_MODELS = [ - "delphi-llama2-100k", - "delphi-llama2-200k", - "delphi-llama2-400k", - "delphi-llama2-800k", - "delphi-llama2-1.6m", - "delphi-llama2-3.2m", - "delphi-llama2-6.4m", - "delphi-llama2-12.8m", - "delphi-llama2-25.6m", -] - -LLAMA2_NEXT_LOGPROBS_DATASETS_MAP = { - "llama2-100k": "delphi-suite/v0-next-logprobs-llama2-100k", - "llama2-200k": "delphi-suite/v0-next-logprobs-llama2-200k", - "llama2-400k": "delphi-suite/v0-next-logprobs-llama2-400k", - 
"llama2-800k": "delphi-suite/v0-next-logprobs-llama2-800k", - "llama2-1.6m": "delphi-suite/v0-next-logprobs-llama2-1.6m", - "llama2-3.2m": "delphi-suite/v0-next-logprobs-llama2-3.2m", - "llama2-6.4m": "delphi-suite/v0-next-logprobs-llama2-6.4m", - "llama2-12.8m": "delphi-suite/v0-next-logprobs-llama2-12.8m", - "llama2-25.6m": "delphi-suite/v0-next-logprobs-llama2-25.6m", -} diff --git a/src/delphi/eval/spacy_token_labelling.py b/src/delphi/eval/spacy_token_labelling.py deleted file mode 100644 index a9a82193..00000000 --- a/src/delphi/eval/spacy_token_labelling.py +++ /dev/null @@ -1,315 +0,0 @@ -from collections.abc import Callable -from pathlib import Path -from typing import Optional - -import pandas as pd -import spacy -from spacy.tokens import Doc, Token -from spacy.util import is_package -from tqdm.auto import tqdm -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast - -# make sure the english language model capabilities are installed by the equivalent of: -# python -m spacy download en_core_web_sm -# Should be run once, initially. Download only starts if not already installed. -SPACY_MODEL = "en_core_web_sm" # small: "en_core_web_sm", large: "en_core_web_trf" -NLP = None # global var to hold the language model -if not is_package(SPACY_MODEL): - spacy.cli.download(SPACY_MODEL, False, False) - - -TOKEN_LABELS: dict[str, Callable] = { - # --- custom categories --- - "Starts with space": (lambda token: token.text.startswith(" ")), # bool - "Capitalized": (lambda token: token.text[0].isupper()), # bool - # --- POS (part-of-speech) categories --- - # They include the Universal POS tags (https://universaldependencies.org/u/pos/) - # -> "POS Tag": (lambda token: token.pos_), # 'NOUN', 'VB', .. 
- "Is Adjective": (lambda token: token.pos_ == "ADJ"), - "Is Adposition": (lambda token: token.pos_ == "ADP"), - "Is Adverb": (lambda token: token.pos_ == "ADV"), - "Is Auxiliary": (lambda token: token.pos_ == "AUX"), - "Is Coordinating conjuction": (lambda token: token.pos_ == "CCONJ"), - "Is Determiner": (lambda token: token.pos_ == "DET"), - "Is Interjunction": (lambda token: token.pos_ == "INTJ"), - "Is Noun": (lambda token: token.pos_ == "NOUN"), - "Is Numeral": (lambda token: token.pos_ == "NUM"), - "Is Particle": (lambda token: token.pos_ == "PART"), - "Is Pronoun": (lambda token: token.pos_ == "PRON"), - "Is Proper Noun": (lambda token: token.pos_ == "PROPN"), - "Is Punctuation": (lambda token: token.pos_ == "PUNCT"), - "Is Subordinating conjuction": (lambda token: token.pos_ == "SCONJ"), - "Is Symbol": (lambda token: token.pos_ == "SYM"), - "Is Verb": (lambda token: token.pos_ == "VERB"), - "Is Other": (lambda token: token.pos_ == "X"), - # --- dependency categories --- - # -> "Dependency": (lambda token: token.dep_), # 'nsubj', 'ROOT', 'dobj', .. - # "Is Subject": (lambda token: token.dep_ == "nsubj"), - # "Is Object": (lambda token: token.dep_ == "dobj"), - # "Is Root": ( - # lambda token: token.dep_ == "ROOT" - # ), # root of the sentence (often a verb) - # "Is auxiliary": (lambda token: token.dep_ == "aux"), - # --- Named entity recognition (NER) categories --- - # "Named Entity Type": (lambda token: token.ent_type_), # '', 'PERSON', 'ORG', 'GPE', .. - "Is Named Entity": (lambda token: token.ent_type_ != ""), -} - - -def explain_token_labels(token: Optional[Token] = None) -> None: - """ - Prints the explanation of a specific token's labels or of ALL - possible labels (POS, dependency, NER, ...), if no token is provided. - - Parameters - ---------- - token : Optional[Token], optional - The token, whose labels should be explained. If None, all labels - possible labels are explained, by default None. 
- """ - if token is not None: - # get token labels - labels = label_single_token(token) - print(" Explanation of token labels ".center(45, "-")) - print("Token text:".ljust(20), token.text) - print("Token dependency:".ljust(20), spacy.glossary.explain(token.dep_)) - print("Token POS:".ljust(20), spacy.glossary.explain(token.pos_)) - print(" Token labels ".center(45, "-")) - for i, (label_name, value) in enumerate(labels.items()): - print(f" {i:2} ", label_name.ljust(20), value) - - else: - glossary = spacy.glossary.GLOSSARY - print( - f"Explanation of all {len(glossary.keys())} token labels (POS, dependency, NER, ...):" - ) - for label, key in glossary.items(): - print(" ", label.ljust(10), key) - - -def label_single_token(token: Token | None) -> dict[str, bool]: - """ - Labels a single token. A token, that has been analyzed by the spaCy - library. - - Parameters - ---------- - token : Token | None - The token to be labelled. - - Returns - ------- - dict[str, bool] - Returns a dictionary with the token's labels as keys and their - corresponding boolean values. - """ - labels = dict() # The dict holding labels of a single token - # if token is None, then it is a '' empty strong token or similar - if token is None: - for label_name, category_check in TOKEN_LABELS.items(): - labels[label_name] = False - labels["Is Other"] = True - return labels - # all other cases / normal tokens - for label_name, category_check in TOKEN_LABELS.items(): - labels[label_name] = category_check(token) - return labels - - -def label_sentence(tokens: Doc | list[Token]) -> list[dict[str, bool]]: - """ - Labels spaCy Tokens in a sentence. Takes the context of the token into account - for dependency labels (e.g. subject, object, ...), IF dependency labels are turned on. - - Parameters - ---------- - tokens : list[Token] - A list of tokens. - - Returns - ------- - list[dict[str, bool]] - Returns a list of the tokens' labels. 
- """ - labelled_tokens = list() # list holding labels for all tokens of sentence - # if the list is empty it is because token is '' empty string or similar - if len(tokens) == 0: - labels = label_single_token(None) - labelled_tokens.append(labels) - return labelled_tokens - # in all other cases - for token in tokens: - labels = label_single_token(token) - labelled_tokens.append(labels) - return labelled_tokens - - -def label_batch_sentences( - sentences: list[str] | list[list[str]], - tokenized: bool = True, - verbose: bool = False, -) -> list[list[dict[str, bool]]]: - """ - Labels tokens in a sentence batchwise. Takes the context of the token into - account for dependency labels (e.g. subject, object, ...). - - Parameters - ---------- - sentences : list - A batch/list of sentences, each being a list of tokens. - tokenized : bool, optional - Whether the sentences are already tokenized, by default True. If the sentences - are full strings and not lists of tokens, then set to False. If true then `sentences` must be list[list[str]]. - verbose : bool, optional - Whether to print the tokens and their labels to the console, by default False. - - Returns - ------- - list[list[dict[str, bool]] - Returns a list of sentences. Each sentence contains a list of its - corresponding token length where each entry provides the labels/categories - for the token. 
Sentence -> Token -> Labels - """ - global NLP, SPACY_MODEL - - if NLP is None: - # Load english language model - NLP = spacy.load(SPACY_MODEL) - # labelled tokens, list holding sentences holding tokens holding corresponding token labels - labelled_sentences: list[list[dict[str, bool]]] = list() - - # go through each sentence in the batch - for sentence in sentences: - if tokenized: - # sentence is a list of tokens - doc = Doc(NLP.vocab, words=sentence) # type: ignore - # Apply the spaCy pipeline, except for the tokenizer - for name, proc in NLP.pipeline: - if name != "tokenizer": - doc = proc(doc) - else: - # sentence is a single string - doc = NLP(sentence) # type: ignore - - labelled_tokens = list() # list holding labels for all tokens of sentence - labelled_tokens = label_sentence(doc) - - # print the token and its labels to console - if verbose is True: - # go through each token in the sentence - for token, labelled_token in zip(doc, labelled_tokens): - print(f"Token: {token}") - print(" | ".join(list(TOKEN_LABELS.keys()))) - printable = [ - str(l).ljust(len(name)) for name, l in labelled_token.items() - ] - printable = " | ".join(printable) - print(printable) - print("---") - # add current sentence's tokens' labels to the list - labelled_sentences.append(labelled_tokens) - - if verbose is True: - print("\n") - - return labelled_sentences - - -def label_tokens_from_tokenizer( - tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, -) -> tuple[str, dict[int, dict[str, bool]]]: - """ - Labels all tokens in a tokenizer's vocabulary with the corresponding token categories (POS, named entity, etc). Returns two things: 1) `tokens_str`, a string where each token comprises 'token_id,token_str\n' and 2) `labelled_token_ids_dict` a dict that contains for each token_id (key) the corresponding token labels, which is in turn a dict, whith the label categories as keys and their boolean values as the dict's values. 
- - Parameters - ---------- - tokenizer : The tokenizer with its tokens to be labelled. - - Returns - ------- - tokens_str, labelled_token_ids_dict - - """ - - def decode( - tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, - token_ids: int | list[int], - ) -> str: - return tokenizer.decode(token_ids, skip_special_tokens=True) - - vocab_size = tokenizer.vocab_size - - # 1) Create a list of all tokens in the tokenizer's vocabulary - tokens_str = "" # will hold all tokens and their ids - for i in range(vocab_size): - tokens_str += f"{i},{decode(tokenizer, i)}\n" - - # 2) let's label each token - labelled_token_ids_dict = {} # token_id: labels - max_token_id = vocab_size # stop at which token id, vocab size - # we iterate over all token_ids individually - for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"): - # decode the token_ids to get a list of tokens, a 'sentence' - token = decode(tokenizer, token_id) # list of tokens == sentence - # put the sentence into a list, to make it a batch of sentences - sentences = [token] - # label the batch of sentences - labels = label_batch_sentences(sentences, tokenized=True, verbose=False) - # create a dict with the token_ids and their labels - # update the labelled_token_ids_dict with the new dict - label = labels[0][0] # first sentence of batch, label of first token - labelled_token_ids_dict[token_id] = label - - return tokens_str, labelled_token_ids_dict - - -def import_token_labels(path: str | Path): - """ - Imports token labels from a *.csv file. - - Parameters - ---------- - path : str | Path - The path to the file. - - Returns - ------- - dict[int, dict[str, bool]] - Returns the labelled tokens dict. Each token_id has its own dict having the labels. - """ - if isinstance(path, str): - path = Path(path) - # make sure the file_type is compatible - file_type = path.suffix - assert ( - file_type == ".csv" - ), f"Invalid file type. Allowed: csv, pkl. 
Got: {file_type}" - # make sure file exists - if not path.exists(): - raise FileNotFoundError(f"There is no file under {path}") - - df = pd.read_csv(str(path)) - categories = list(df.columns[1:]) # excluding first column: token_id - loaded_label_dict: dict[int, dict[str, bool]] = {} - # go through each row and construct the dict - for _, row in df.iterrows(): - token_id = int(row["token_id"]) - labels = {cat: bool(row[cat] == 1) for cat in categories} - loaded_label_dict[token_id] = labels - - return loaded_label_dict - - -def convert_label_dict_to_df( - labelled_token_ids_dict: dict[int, dict[str, bool]] -) -> pd.DataFrame: - """ - Takes a `labelled_token_ids_dict` and converts it into a Pandas Dataframe. - """ - df = pd.DataFrame(labelled_token_ids_dict.items(), columns=["token_id", "label"]) - # split the label column into multiple columns - df = df.join(pd.DataFrame(df.pop("label").tolist())) - # Change datatype of columns to float - df = df.astype(int) - - return df diff --git a/src/delphi/eval/token_map.py b/src/delphi/eval/token_map.py deleted file mode 100644 index 4ac7b0df..00000000 --- a/src/delphi/eval/token_map.py +++ /dev/null @@ -1,18 +0,0 @@ -import os -from typing import cast - -from datasets import Dataset - - -def token_map( - tokenized_dataset: Dataset, - tokenizer_size: int, -) -> list[list[tuple[int, int]]]: - """Return a mapping of tokens to their (prompt_idx, token_idx) locations in the tokenized_dataset.""" - - mapping = [[] for _ in range(tokenizer_size)] - for prompt_idx, prompt in enumerate(tokenized_dataset): - prompt = cast(dict, prompt) - for position_idx, token in enumerate(prompt["tokens"]): - mapping[token].append((prompt_idx, position_idx)) - return mapping diff --git a/src/delphi/eval/token_positions.py b/src/delphi/eval/token_positions.py deleted file mode 100644 index 5239a53f..00000000 --- a/src/delphi/eval/token_positions.py +++ /dev/null @@ -1,53 +0,0 @@ -from numbers import Number -from typing import Optional, cast - -import 
torch -from datasets import Dataset -from jaxtyping import Int - -from delphi.eval.utils import dict_filter_quantile - - -def get_all_tok_metrics_in_label( - token_ids: Int[torch.Tensor, "prompt pos"], - token_labels: dict[int, dict[str, bool]], - metrics: torch.Tensor, - label: str, - q_start: Optional[float] = None, - q_end: Optional[float] = None, -) -> dict[tuple[int, int], float]: - """ - From the token_map, get all the positions of the tokens that have a certain label. - We don't use the token_map because for sampling purposes, iterating through token_ids is more efficient. - Optionally, filter the tokens based on the quantile range of the metrics. - - Args: - - token_ids (Dataset): token_ids dataset e.g. token_ids[0] = {"tokens": [[1, 2, ...], [2, 5, ...], ...]} - - token_labels (dict[int, dict[str, bool]]): dictionary of token labels e.g. { 0: {"Is Noun": True, "Is Verb": False}, ...} - - metrics (torch.Tensor): tensor of metrics to search through e.g. torch.tensor([[0.1, 0.2, ...], [0.3, 0.4, ...], ...]) - - label (str): the label to search for - - q_start (float): the start of the quantile range to filter the metrics e.g. 0.1 - - q_end (float): the end of the quantile range to filter the metrics e.g. 0.9 - - Returns: - - tok_positions (dict[tuple[int, int], Number]): dictionary of token positions and their corresponding metrics - """ - - # check if metrics have the same dimensions as token_ids - if metrics.shape != token_ids.shape: - raise ValueError( - f"Expected metrics to have the same shape as token_ids, but got {metrics.shape} and {token_ids.shape} instead." 
- ) - - tok_positions = {} - for prompt_pos, prompt in enumerate(token_ids.numpy()): - for tok_pos, tok in enumerate(prompt): - if token_labels[tok][label]: - tok_positions[(prompt_pos, tok_pos)] = metrics[ - prompt_pos, tok_pos - ].item() - - if q_start is not None and q_end is not None: - tok_positions = dict_filter_quantile(tok_positions, q_start, q_end) - - return tok_positions diff --git a/src/delphi/eval/utils.py b/src/delphi/eval/utils.py deleted file mode 100644 index faf33757..00000000 --- a/src/delphi/eval/utils.py +++ /dev/null @@ -1,133 +0,0 @@ -import logging -from collections.abc import Callable -from typing import Any, cast - -import numpy as np -import torch -from datasets import Dataset, load_dataset -from jaxtyping import Float, Int -from transformers import PreTrainedModel, PreTrainedTokenizerBase - -from delphi.eval import constants - - -def get_all_logprobs( - model: Callable, input_ids: Int[torch.Tensor, "batch seq"] -) -> Float[torch.Tensor, "batch seq vocab"]: - # batch, seq, vocab - logits = model(input_ids).logits - return torch.log_softmax(logits, dim=-1) - - -# convenience wrapper for calling on a single sample -def get_single_logprobs( - model: Callable, input_ids: Int[torch.Tensor, "seq"] -) -> Float[torch.Tensor, "seq vocab"]: - return get_all_logprobs(model, input_ids.unsqueeze(0))[0] - - -def gather_logprobs( - logprobs: Float[torch.Tensor, "batch seq vocab"], - tokens: Int[torch.Tensor, "batch seq"], -) -> Float[torch.Tensor, "batch seq"]: - return torch.gather(logprobs, -1, tokens.unsqueeze(-1)).squeeze(-1) - - -def get_all_and_next_logprobs( - model: Callable, - input_ids: Int[torch.Tensor, "batch seq"], -) -> tuple[ - Float[torch.Tensor, "batch shorter_seq vocab"], - Float[torch.Tensor, "batch shorter_seq"], -]: - logprobs = get_all_logprobs(model, input_ids[:, :-1]) - next_tokens = input_ids[:, 1:] - return logprobs, gather_logprobs(logprobs, next_tokens) - - -def get_all_and_next_logprobs_single( - model: Callable, - 
input_ids: Int[torch.Tensor, "seq"], -) -> tuple[ - Float[torch.Tensor, "shorter_seq vocab"], - Float[torch.Tensor, "shorter_seq"], -]: - all_logprobs, next_logprobs = get_all_and_next_logprobs( - model, input_ids.unsqueeze(0) - ) - return all_logprobs[0], next_logprobs[0] - - -def get_next_and_top_k_probs( - model: PreTrainedModel, input_ids: Int[torch.Tensor, "seq"], k: int = 3 -) -> tuple[Float[torch.Tensor, "shorter_seq"], torch.return_types.topk,]: - all_logprobs, next_logprobs = get_all_and_next_logprobs_single(model, input_ids) - all_probs = torch.exp(all_logprobs) - next_probs = torch.exp(next_logprobs) - top_k = torch.topk(all_probs, k, dim=-1) - return next_probs, top_k - - -def load_delphi_dataset(dataset_name: str, split: str, slice: str = "") -> Dataset: - # check that split is either "train" or "validation" - if split not in ["train", "validation"]: - raise ValueError(f"Split must be either 'train' or 'validation', not {split}") - if "/" not in dataset_name: - dataset_name = f"delphi-suite/{dataset_name}" - data_files_str = f"data/{split}-*.parquet" - dataset = load_dataset( - dataset_name, - data_files=data_files_str, - verification_mode="no_checks", - # Currently, load_dataset returns a dataset dict *unless* a split is specified, - # EVEN IF NO SPLIT WITHIN THE DATA FILES SPECIFIED. If there's no split arg, - # huggingface just just says everything is in the "train" split and returns {"train": dataset}. - # In our case the data_files glob already specifies just the validation files, so we - # shouldn't need to specify a split. But we do need to specify a split to get a dataset object, - # or we'd get a Dataset dict. 
See https://github.com/huggingface/datasets/issues/5189 - split=f"train{slice}", - ) - dataset = cast(Dataset, dataset) - logging.info(f" Loaded {data_files_str} ({len(dataset)} entries)") - return dataset - - -def load_validation_dataset(dataset_name: str, slice: str = "") -> Dataset: - return load_delphi_dataset(dataset_name, "validation", slice) - - -def load_train_dataset(dataset_name: str, slice: str = "") -> Dataset: - return load_delphi_dataset(dataset_name, "train", slice) - - -def tokenize( - tokenizer: PreTrainedTokenizerBase, sample_txt: str -) -> Int[torch.Tensor, "seq"]: - # supposedly this can be different than prepending the bos token id - return cast( - Int[torch.Tensor, "seq"], - tokenizer.encode(tokenizer.bos_token + sample_txt, return_tensors="pt")[0], - ) - - -def load_logprob_dataset(model: str): - return load_dataset(f"transcendingvictor/{model}-validation-logprobs") - - -def load_logprob_datasets(split: str = "validation") -> dict[str, list[list[float]]]: - return { - model: cast(dict, load_logprob_dataset(model)[split])["logprobs"] # type: ignore - for model in constants.LLAMA2_MODELS - } - - -def dict_filter_quantile( - d: dict[Any, float], q_start: float, q_end: float -) -> dict[Any, float]: - if not (0 <= q_start < q_end <= 1): - raise ValueError("Invalid quantile range") - q_start_val = np.nanquantile(list(d.values()), q_start) - q_end_val = np.nanquantile(list(d.values()), q_end) - return { - k: v for k, v in d.items() if q_start_val <= v <= q_end_val and not np.isnan(v) - } diff --git a/src/delphi/eval/vis.py b/src/delphi/eval/vis.py deleted file mode 100644 index 1a69eae2..00000000 --- a/src/delphi/eval/vis.py +++ /dev/null @@ -1,257 +0,0 @@ -import math -import random -import uuid -from typing import cast - -import panel as pn -import torch -from IPython.core.display import HTML -from IPython.core.display_functions import display -from jaxtyping import Float, Int -from transformers import PreTrainedTokenizerBase - - -def 
probs_to_colors(probs: Float[torch.Tensor, "next_pos"]) -> list[str]: - # for the endoftext token - # no prediction, no color - colors = ["white"] - for p in probs.tolist(): - red_gap = 150 # the higher it is, the less red the tokens will be - green_blue_val = red_gap + int((255 - red_gap) * (1 - p)) - colors.append(f"rgb(255, {green_blue_val}, {green_blue_val})") - return colors - - -def single_loss_diff_to_color(loss_diff: float) -> str: - # if loss_diff is negative, we want the color to be red - # if loss_diff is positive, we want the color to be green - # if loss_diff is 0, we want the color to be white - # the color should be more intense the larger the absolute value of loss_diff - - def sigmoid(x: float) -> float: - return 1 / (1 + math.exp(-x)) - - scaled_loss_diff = sigmoid(loss_diff) # scale to 0-1 - - if scaled_loss_diff < 0.5: # red - red_val = 255 - green_blue_val = min(int(255 * 2 * scaled_loss_diff), 255) - return f"rgb({red_val}, {green_blue_val}, {green_blue_val})" - else: # green - green_val = 255 - red_blue_val = min(int(255 * 2 * (1 - scaled_loss_diff)), 255) - return f"rgb({red_blue_val}, {green_val}, {red_blue_val})" - - -def to_tok_prob_str(tok: int, prob: float, tokenizer: PreTrainedTokenizerBase) -> str: - tok_str = tokenizer.decode(tok).replace(" ", " ").replace("\n", r"\n") - prob_str = f"{prob:.2%}" - return f"{prob_str:>6} |{tok_str}|" - - -def token_to_html( - token: int, - tokenizer: PreTrainedTokenizerBase, - bg_color: str, - data: dict, -) -> str: - data = data or {} # equivalent to if not data: data = {} - # non-breakable space, w/o it leading spaces wouldn't be displayed - str_token = tokenizer.decode(token).replace(" ", " ") - - # background or user-select (for \n) goes here - specific_styles = {} - # for now just adds line break or doesn't - br = "" - - if bg_color: - specific_styles["background-color"] = bg_color - if str_token == "\n": - # replace new line character with two characters: \ and n - str_token = r"\n" - # add line 
break in html - br += "
" - # this is so we can copy the prompt without "\n"s - specific_styles["user-select"] = "none" - - style_str = data_str = "" - # converting style dict into the style attribute - if specific_styles: - inside_style_str = "; ".join(f"{k}: {v}" for k, v in specific_styles.items()) - style_str = f" style='{inside_style_str}'" - if data: - data_str = "".join( - f" data-{k}='{v.replace(' ', ' ')}'" for k, v in data.items() - ) - return f"
{str_token}
{br}" - - -_token_style = { - "border": "1px solid #888", - "display": "inline-block", - # each character of the same width, so we can easily spot a space - "font-family": "monospace", - "font-size": "14px", - "color": "black", - "background-color": "white", - "margin": "1px 0px 1px 1px", - "padding": "0px 1px 1px 1px", -} -_token_style_str = " ".join([f"{k}: {v};" for k, v in _token_style.items()]) - - -def vis_sample_prediction_probs( - sample_tok: Int[torch.Tensor, "pos"], - correct_probs: Float[torch.Tensor, "pos"], - top_k_probs: torch.return_types.topk, - tokenizer: PreTrainedTokenizerBase, -) -> str: - colors = probs_to_colors(correct_probs) - token_htmls = [] - - # Generate a unique ID for this instance (so we can have multiple instances on the same page) - unique_id = str(uuid.uuid4()) - - token_class = f"token_{unique_id}" - hover_div_id = f"hover_info_{unique_id}" - - for i in range(sample_tok.shape[0]): - tok = cast(int, sample_tok[i].item()) - data = {} - if i > 0: - correct_prob = correct_probs[i - 1].item() - data["next"] = to_tok_prob_str(tok, correct_prob, tokenizer) - top_k_probs_tokens = top_k_probs.indices[i - 1] - top_k_probs_values = top_k_probs.values[i - 1] - for j in range(top_k_probs_tokens.shape[0]): - top_tok = top_k_probs_tokens[j].item() - top_tok = cast(int, top_tok) - top_prob = top_k_probs_values[j].item() - data[f"top{j}"] = to_tok_prob_str(top_tok, top_prob, tokenizer) - - token_htmls.append( - token_to_html(tok, tokenizer, bg_color=colors[i], data=data).replace( - "class='token'", f"class='{token_class}'" - ) - ) - - html_str = f""" - - {"".join(token_htmls)}
- - """ - display(HTML(html_str)) - return html_str - - -def vis_pos_map( - pos_map: dict[tuple[int, int], float | int], - token_ids: Int[torch.Tensor, "prompt pos"], - tokenizer: PreTrainedTokenizerBase, - sample: int = 3, -): - """ - Randomly sample from pos_map and visualize the loss diff at the corresponding position. - """ - - token_htmls = [] - unique_id = str(uuid.uuid4()) - token_class = f"token_{unique_id}" - hover_div_id = f"hover_info_{unique_id}" - - # choose n random keys from pos_map - keys = random.sample(list(pos_map.keys()), k=sample) - - for key in keys: - prompt, pos = key - pre_toks = token_ids[prompt][:pos] - mask = torch.isin(pre_toks, torch.tensor([0, 1], dtype=torch.int8)) - pre_toks = pre_toks[ - ~mask - ] # remove and tokens, cause strikethrough in html - - for i in range(pre_toks.shape[0]): - pre_tok = cast(int, pre_toks[i].item()) - token_htmls.append( - token_to_html(pre_tok, tokenizer, bg_color="white", data={}).replace( - "class='token'", f"class='{token_class}'" - ) - ) - - tok = cast(int, token_ids[prompt][pos].item()) - value = cast(float, pos_map[key]) - - token_htmls.append( - token_to_html( - tok, - tokenizer, - bg_color=single_loss_diff_to_color(value), - data={"loss-diff": f"{value:.2f}"}, - ).replace("class='token'", f"class='{token_class}'") - ) - - # add break line - token_htmls.append("

") - - html_str = f""" - - {"".join(token_htmls)}
- - """ - display(HTML(html_str)) - return html_str - - -def token_selector( - vocab_map: dict[str, int] -) -> tuple[pn.widgets.MultiChoice, list[int]]: - tokens = list(vocab_map.keys()) - token_selector = pn.widgets.MultiChoice(name="Tokens", options=tokens) - token_ids = [vocab_map[token] for token in cast(list[str], token_selector.value)] - - def update_tokens(event): - token_ids.clear() - token_ids.extend([vocab_map[token] for token in event.new]) - - token_selector.param.watch(update_tokens, "value") - return token_selector, token_ids diff --git a/src/delphi/eval/vis_per_token_model.py b/src/delphi/eval/vis_per_token_model.py deleted file mode 100644 index 8daaa96f..00000000 --- a/src/delphi/eval/vis_per_token_model.py +++ /dev/null @@ -1,99 +0,0 @@ -from typing import Union - -import ipywidgets -import numpy as np -import plotly.graph_objects as go - - -def visualize_per_token_category( - input: dict[Union[str, int], dict[str, tuple]], - log_scale=False, - line_metric="Means", - checkpoint_mode=True, - shade_color="rgba(68, 68, 68, 0.3)", - line_color="rgb(31, 119, 180)", - bar_color="purple", - marker_color="SkyBlue", - background_color="AliceBlue", -) -> go.FigureWidget: - input_x = list(input.keys()) - categories = list(input[input_x[0]].keys()) - category = categories[0] - - def get_hovertexts(mid: np.ndarray, lo: np.ndarray, hi: np.ndarray) -> list[str]: - return [f"Loss: {m:.3f} ({l:.3f}, {h:.3f})" for m, l, h in zip(mid, lo, hi)] - - def get_plot_values(category: str) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - x = np.array([input[x][category] for x in input_x]).T - means, err_lo, err_hi = x[0], x[1], x[2] - return means, err_lo, err_hi - - means, err_lo, err_hi = get_plot_values(category) - - if checkpoint_mode: - scatter_plot = go.Figure( - [ - go.Scatter( - name="Upper Bound", - x=input_x, - y=means + err_hi, - mode="lines", - marker=dict(color=shade_color), - line=dict(width=0), - showlegend=False, - ), - go.Scatter( - name="Lower Bound", - 
x=input_x, - y=means - err_lo, - marker=dict(color=shade_color), - line=dict(width=0), - mode="lines", - fillcolor=shade_color, - fill="tonexty", - showlegend=False, - ), - go.Scatter( - name=line_metric, - x=input_x, - y=means, - mode="lines", - marker=dict( - color=line_color, - size=0, - line=dict(color=line_color, width=1), - ), - ), - ] - ) - else: - scatter_plot = go.Scatter( - x=input_x, - y=means, - error_y=dict( - type="data", - symmetric=False, - array=err_hi, - arrayminus=err_lo, - color=bar_color, - ), - marker=dict( - color=marker_color, - size=15, - line=dict(color=line_color, width=2), - ), - hovertext=get_hovertexts(means, err_lo, err_hi), - hoverinfo="text+x", - ) - g = go.FigureWidget( - data=scatter_plot, - layout=go.Layout( - yaxis=dict( - title="Loss", - type="log" if log_scale else "linear", - ), - plot_bgcolor=background_color, - ), - ) - - return g diff --git a/src/delphi/test_configs/__init__.py b/src/delphi/test_configs/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/delphi/test_configs/debug_transformers_bloom.json b/src/delphi/test_configs/debug_transformers_bloom.json deleted file mode 100644 index 2d72396e..00000000 --- a/src/delphi/test_configs/debug_transformers_bloom.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "max_seq_len": 512, - "max_epochs": 2, - "eval_iters": 1, - "batch_size": 64, - "model_config": { - "model_class": "BloomForCausalLM", - "apply_residual_connection_post_layernorm": false, - "attention_dropout": 0.0, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_dropout": 0.0, - "hidden_size": 8, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "n_head": 2, - "n_layer": 2, - "pretraining_tp": 1, - "slow_but_exact": false, - "use_cache": true, - "vocab_size": 4096 - }, - "batch_ordering_seed": 42, - "torch_seed": 1337, - "dataset": { - "name": "delphi-suite/v0-tinystories-v2-clean-tokenized" - }, - "out_repo_id": "" -} \ No newline at end of file diff --git 
a/src/delphi/test_configs/v0-llama2-100k.json b/src/delphi/test_configs/v0-llama2-100k.json deleted file mode 100644 index 584b5017..00000000 --- a/src/delphi/test_configs/v0-llama2-100k.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "max_seq_len": 512, - "model_config": { - "model_class": "LlamaForCausalLM", - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 48, - "initializer_range": 0.02, - "intermediate_size": 128, - "max_position_embeddings": 512, - "num_attention_heads": 8, - "num_hidden_layers": 4, - "num_key_value_heads": 4, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 10000.0, - "tie_word_embeddings": true, - "use_cache": true, - "vocab_size": 4096 - }, - "batch_ordering_seed": 42, - "torch_seed": 1337, - "dataset": { - "name": "delphi-suite/v0-tinystories-v2-clean-tokenized" - }, - "out_repo_id": "" -} \ No newline at end of file diff --git a/src/delphi/train/__init__.py b/src/delphi/train/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/delphi/train/config/dataset_config.py b/src/delphi/train/config/dataset_config.py deleted file mode 100644 index 0c1e356e..00000000 --- a/src/delphi/train/config/dataset_config.py +++ /dev/null @@ -1,46 +0,0 @@ -from dataclasses import dataclass, field -from typing import cast - -import datasets -from beartype import beartype -from datasets import Dataset, load_dataset - - -@beartype -@dataclass(frozen=True) -class DatasetConfig: - name: str = field( - metadata={"help": "tokenized dataset on huggingface to use for train"}, - ) - feature: str = field( - default="tokens", - metadata={ - "help": "feature in the train dataset to use for train; should be a list of max_seq_len+1 token ints" - }, - ) - train_split: str = field( - default="train", - metadata={"help": "split of the dataset to use for training"}, - ) - validation_split: str = field( - default="validation", - 
metadata={"help": "split of the dataset to use for validation"}, - ) - - def _load(self, split) -> Dataset: - ds = load_dataset( - self.name, - split=split, - features=datasets.Features( - {self.feature: datasets.Sequence(datasets.Value("int32"))} - ), - ) - ds = cast(Dataset, ds) - ds.set_format("torch") - return ds - - def load_train(self) -> Dataset: - return self._load(self.train_split) - - def load_validation(self) -> Dataset: - return self._load(self.validation_split) diff --git a/src/delphi/train/config/training_config.py b/src/delphi/train/config/training_config.py deleted file mode 100644 index c68896d4..00000000 --- a/src/delphi/train/config/training_config.py +++ /dev/null @@ -1,106 +0,0 @@ -import os -from dataclasses import dataclass, field -from datetime import datetime -from typing import Any, Optional - -import platformdirs -from beartype import beartype - -from .adam_config import AdamConfig -from .dataset_config import DatasetConfig -from .debug_config import DebugConfig -from .wandb_config import WandbConfig - - -@beartype -@dataclass(frozen=True, kw_only=True) -class TrainingConfig: - model_config: dict[str, Any] = field( - metadata={ - "help": "model config; class_name=name of model class in transformers, everything else is kwargs for the corresponding model config" - }, - ) - max_seq_len: int = field(metadata={"help": "max sequence length"}) - # meta - run_name: str = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") - output_dir: str = field( - default=os.path.join(platformdirs.user_data_dir(appname="delphi"), run_name), - metadata={"help": "output directory"}, - ) - - # device - device: str = field( - default="auto", metadata={"help": "device to use (cuda, mps, cpu)"} - ) - - # checkpoints, logging, eval - checkpoint_interval: int = field( - default=2000, metadata={"help": "checkpoint every N iters"} - ) - extra_checkpoint_iters: list[int] = field( - default_factory=list, - metadata={"help": "manually list iterations to save checkpoints on"}, - ) 
- log_interval: int = field(default=1, metadata={"help": "log every N iters"}) - eval_iters: int = field(default=100, metadata={"help": "use N iters for each eval"}) - - # resume from checkpoint - resume_from_path: Optional[str] = field( - default=None, - metadata={ - "help": "path to a checkpoint to resume from (if init_from=='resume')" - }, - ) - - # data - batch_size: int = field( - default=64, - metadata={ - "help": "number of samples used to compute the gradient for a single optimizer step" - }, - ) - - # training - max_epochs: int = field( - default=10, metadata={"help": "total number of training epochs"} - ) - grad_clip: float = field( - default=1.0, - metadata={"help": "clip gradients at this value, or disable if == 0.0"}, - ) - gradient_accumulation_steps: int = field( - default=1, - metadata={ - "help": "if > 1 reduces memory usage by computing gradient in microbatches" - }, - ) - # (adamw) optimizer - adam: AdamConfig = field(default_factory=AdamConfig) - - # reproducibility - batch_ordering_seed: int = field( - metadata={"help": "seed used for pseudorandomly sampling data during training"}, - ) - torch_seed: int = field(metadata={"help": "seed used for torch"}) - save_optimizer: bool = True - - # data - dataset: DatasetConfig = field( - metadata={"help": "specify training and validation data"}, - ) - - tokenizer: str = field( - default="", - metadata={ - "help": "HF repo id or local directory containing the tokenizer. 
Used only to upload it to HF with the model, not for training" - }, - ) - - # third party - wandb: Optional[WandbConfig] = None - out_repo_id: str = field( - metadata={"help": "set to empty string to not push to repo"}, - ) - - # debug - debug_config: DebugConfig = field(default_factory=DebugConfig) diff --git a/src/delphi/train/config/wandb_config.py b/src/delphi/train/config/wandb_config.py deleted file mode 100644 index 9b4e3c55..00000000 --- a/src/delphi/train/config/wandb_config.py +++ /dev/null @@ -1,11 +0,0 @@ -from dataclasses import dataclass - -from beartype import beartype - - -@beartype -@dataclass -class WandbConfig: - project: str - entity: str - silence: bool = False diff --git a/tests/eval/test_compare_models.py b/tests/eval/test_compare_models.py deleted file mode 100644 index 0521b0cb..00000000 --- a/tests/eval/test_compare_models.py +++ /dev/null @@ -1,23 +0,0 @@ -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -from delphi.eval.compare_models import NextTokenStats, compare_models -from delphi.eval.utils import load_validation_dataset, tokenize - - -def test_compare_models(): - with torch.set_grad_enabled(False): - model = AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M") - model_instruct = AutoModelForCausalLM.from_pretrained( - "roneneldan/TinyStories-Instruct-1M" - ) - ds_txt = load_validation_dataset("tinystories-v2-clean")["story"] - tokenizer = AutoTokenizer.from_pretrained("roneneldan/TinyStories-1M") - sample_tok = tokenize(tokenizer, ds_txt[0]) - K = 3 - model_comparison = compare_models(model, model_instruct, sample_tok, top_k=K) - # ignore the first element comparison - assert model_comparison[0] is None - assert isinstance(model_comparison[1], NextTokenStats) - assert len(model_comparison) == sample_tok.shape[0] - assert len(model_comparison[1].topk) == K diff --git a/tests/eval/test_spacy_token_labelling.py b/tests/eval/test_spacy_token_labelling.py deleted file mode 100644 index 
8a799b95..00000000 --- a/tests/eval/test_spacy_token_labelling.py +++ /dev/null @@ -1,189 +0,0 @@ -import pickle -from pathlib import Path - -import pytest -from spacy.language import Language -from spacy.tokens import Doc -from transformers import AutoTokenizer - -import delphi.eval.spacy_token_labelling as tl - -# skip all tests in this module -pytestmark = pytest.mark.skip( - "tests are slow and we're not using this module currently" -) - -labelled_token_ids_dict: dict[int, dict[str, bool]] = {} - - -@pytest.fixture -def dummy_doc() -> tuple[str, Doc, dict[str, bool]]: - """ - Create a dummy Doc (list of Tokens) with specific attributes for testing purposes. - """ - nlp_dummy = Language() - - # Assume we're creating a dummy token with specific attributes - words = ["Peter", "is", "a", "person"] - spaces = [True, True, True, True] # No space after "dummy_token" - pos_tags = ["PROPN", "AUX", "DET", "NOUN"] # Part-of-speech tag - dep_tags = ["nsubj", "ROOT", "det", "attr"] # Dependency tag - ner_tags = ["PERSON", "", "", ""] # Named entity tag - - # Ensure the length of pos_tags and dep_tags matches the length of words - assert len(words) == len(pos_tags) == len(dep_tags) == len(ner_tags) - - # Create a Doc with one dummy token - doc = Doc(nlp_dummy.vocab, words=words, spaces=spaces) - - # Manually set POS, dependency and NER tags - for token, pos, dep, ner_tag in zip(doc, pos_tags, dep_tags, ner_tags): - token.pos_, token.dep_, token.ent_type_ = pos, dep, ner_tag - - # Token labels for "Peter" in the dummy doc - PETER_TOKEN_LABEL = { - "Starts with space": False, - "Capitalized": True, - "Is Adjective": False, - "Is Adposition": False, - "Is Adverb": False, - "Is Auxiliary": False, - "Is Coordinating conjuction": False, - "Is Determiner": False, - "Is Interjunction": False, - "Is Noun": False, - "Is Numeral": False, - "Is Particle": False, - "Is Pronoun": False, - "Is Proper Noun": True, - "Is Punctuation": False, - "Is Subordinating conjuction": False, - "Is 
Symbol": False, - "Is Verb": False, - "Is Other": False, - "Is Named Entity": True, - } - text = " ".join(words) - return text, doc, PETER_TOKEN_LABEL - - -def test_explain_token_labels(dummy_doc): - """ - Test the explain_token_labels function. - """ - # explain all labels - tl.explain_token_labels() - # print explanations for the first token in doc - text, doc, PETER_TOKEN_LABEL = dummy_doc - tl.explain_token_labels(doc[0]) - - -def test_label_single_token(dummy_doc): - """ - Test the label_single_token function. - """ - # create a dummy token - text, doc, PETER_TOKEN_LABEL = dummy_doc - token = doc[0] - # label the token - labels = tl.label_single_token(token) - # check if the labels are correct - assert labels == PETER_TOKEN_LABEL - - -def test_label_sentence(dummy_doc): - """ - Test the label_sentence function. - """ - text, doc, PETER_TOKEN_LABEL = dummy_doc - # label the sentence - labels = tl.label_sentence(doc) - # assert the first token is labeled correctly - assert labels[0] == PETER_TOKEN_LABEL - # iterate through tokens in doc - for token, label in zip(doc, labels): - assert label == tl.label_single_token(token) - - -def test_label_batch_sentences(dummy_doc): - """ - Test the label_batch_sentences function. - """ - # create a batch of sentences - text, doc, PETER_TOKEN_LABEL = dummy_doc - text = text.split(" ") - batch = [text, text, text] - # label the batch - labels = tl.label_batch_sentences(batch, tokenized=True) - # assert the first token is labeled correctly - assert labels[0][0] == PETER_TOKEN_LABEL - assert labels[1][0] == PETER_TOKEN_LABEL - assert labels[2][0] == PETER_TOKEN_LABEL - # iterate through tokens in doc - for token, label in zip(doc, labels[0]): - assert label == tl.label_single_token(token) - - -def is_valid_structure(obj: dict[int, dict[str, bool]]) -> bool: - """ - Checks whether the obj fits the structure of `dict[int, dict[str, bool]]`. Returns True, if it fits, False otherwise. 
- """ - if not isinstance(obj, dict): - print(f"Main structure is not dict! Instead is type {type(obj)}") - return False - for key, value in obj.items(): - if not isinstance(key, int) or not isinstance(value, dict): - print( - f"Main structure is dict, but its keys are either not int or its values are not dicts. Instead key is type {type(key)} and value is type {type(value)}" - ) - return False - for sub_key, sub_value in value.items(): - if not isinstance(sub_key, str) or not isinstance(sub_value, bool): - print( - f"The structure dict[int, dict[X, Y]] is True, but either X is not str or Y is not bool. Instead X is type {type(sub_key)} and Y is type {type(sub_value)}" - ) - return False - return True - - -def test_label_tokens_from_tokenizer(): - """ - Simple test, checking if download of tokinzer and the labelling of all tokens in its vocabulary works. - """ - global labelled_token_ids_dict - # get a tokinzer - model_name = "delphi-suite/delphi-llama2-100k" - tokenizer = AutoTokenizer.from_pretrained(model_name) - vocab_size = tokenizer.vocab_size - - tokens_str, labelled_token_ids_dict = tl.label_tokens_from_tokenizer(tokenizer) - # count the number of lines in the token_str - assert tokens_str.count("\n") == (vocab_size + 1) # + 1, because of token '\n' - assert len(labelled_token_ids_dict.keys()) == vocab_size - assert is_valid_structure(labelled_token_ids_dict) == True - - -@pytest.mark.parametrize("path", [Path("temp/token_labels.csv")]) -def test_import_token_labels(path: Path): - """ - Simple test, checking if the import of token labels works. - - Note: Because we want to use pure pytest and not install any extra dependencies (e.g. pytest-depencency) we recreate the `labelled_tokens_dict` in this test as we did in `test_label_tokens_from_tokenizer`. This duplication is not ideal, but it is the best quick&dirty solution for now. 
- """ - # create the labelled_token_ids_dict - model_name = "delphi-suite/delphi-llama2-100k" - tokenizer = AutoTokenizer.from_pretrained(model_name) - _, labelled_token_ids_dict = tl.label_tokens_from_tokenizer(tokenizer) - - # create the path - path.parent.mkdir(parents=True, exist_ok=True) - # save the file - df = tl.convert_label_dict_to_df(labelled_token_ids_dict) - df.to_csv(path, index=False) - - # load the file with our function to be tested - loaded_dict = tl.import_token_labels(path) - - # assure that the structure is correct - assert loaded_dict == labelled_token_ids_dict - assert is_valid_structure(loaded_dict) == True diff --git a/tests/eval/test_token_map.py b/tests/eval/test_token_map.py deleted file mode 100644 index 2f896326..00000000 --- a/tests/eval/test_token_map.py +++ /dev/null @@ -1,47 +0,0 @@ -import pytest -from datasets import Dataset - -from delphi.eval.token_map import token_map - - -def test_token_map(): - tokenized_dataset = Dataset.from_dict( - { - "tokens": [ - [0, 1, 2, 3, 4, 5, 0, 6, 7], - [0, 1, 2, 3, 4, 5, 0, 6, 7], - [0, 1, 2, 3, 4, 5, 0, 6, 7], - ] - } - ) - assert token_map(tokenized_dataset, tokenizer_size=9) == [ - [(0, 0), (0, 6), (1, 0), (1, 6), (2, 0), (2, 6)], - [(0, 1), (1, 1), (2, 1)], - [(0, 2), (1, 2), (2, 2)], - [(0, 3), (1, 3), (2, 3)], - [(0, 4), (1, 4), (2, 4)], - [(0, 5), (1, 5), (2, 5)], - [(0, 7), (1, 7), (2, 7)], - [(0, 8), (1, 8), (2, 8)], - [], # token 8 is not present in the dataset - ] - - # fmt: off - tokenized_dataset = Dataset.from_dict( - { # one really long prompt - "tokens": [ - [0, 1, 2, 3, 4, 5, 0, 6, 7, 0, 1, 2, 3, 4, 5, 0, 6, 7, 0, 1, 2, 3, 4, 5, 0, 6, 7] - ] - } - ) - # fmt: on - assert token_map(tokenized_dataset, tokenizer_size=8) == [ - [(0, 0), (0, 6), (0, 9), (0, 15), (0, 18), (0, 24)], - [(0, 1), (0, 10), (0, 19)], - [(0, 2), (0, 11), (0, 20)], - [(0, 3), (0, 12), (0, 21)], - [(0, 4), (0, 13), (0, 22)], - [(0, 5), (0, 14), (0, 23)], - [(0, 7), (0, 16), (0, 25)], - [(0, 8), (0, 17), (0, 
26)], - ] diff --git a/tests/eval/test_token_positions.py b/tests/eval/test_token_positions.py deleted file mode 100644 index 1adef6b7..00000000 --- a/tests/eval/test_token_positions.py +++ /dev/null @@ -1,51 +0,0 @@ -from math import isclose -from typing import cast - -import pytest -from datasets import Dataset - -from delphi.eval.token_positions import * - - -@pytest.fixture -def mock_data(): - token_ids = Dataset.from_dict( - {"tokens": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]} - ).with_format("torch") - token_labels = { - 1: {"Is Noun": False, "Is Verb": True}, - 2: {"Is Noun": True, "Is Verb": True}, - 3: {"Is Noun": False, "Is Verb": False}, - 4: {"Is Noun": True, "Is Verb": False}, - 5: {"Is Noun": False, "Is Verb": True}, - 6: {"Is Noun": True, "Is Verb": True}, - 7: {"Is Noun": False, "Is Verb": False}, - 8: {"Is Noun": True, "Is Verb": False}, - 9: {"Is Noun": False, "Is Verb": True}, - } - metrics = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]) - return token_ids, token_labels, metrics - - -def test_get_all_tok_metrics_in_label(mock_data): - token_ids, token_labels, metrics = mock_data - result = get_all_tok_metrics_in_label( - token_ids["tokens"], token_labels, metrics, "Is Noun" - ) - expected = { - (0, 1): 0.2, - (1, 0): 0.4, - (1, 2): 0.6, - (2, 1): 0.8, - } - # use isclose to compare floating point numbers - for k in result: - assert isclose(cast(float, result[k]), expected[k], rel_tol=1e-6) # type: ignore - - # test with quantile filtering - result_q = get_all_tok_metrics_in_label( - token_ids["tokens"], token_labels, metrics, "Is Noun", q_start=0.3, q_end=1.0 - ) - expected_q = {(1, 2): 0.6, (2, 1): 0.8, (1, 0): 0.4} - for k in result_q: - assert isclose(cast(float, result_q[k]), expected_q[k], rel_tol=1e-6) # type: ignore diff --git a/tests/eval/test_utils_eval.py b/tests/eval/test_utils_eval.py deleted file mode 100644 index ad0f54b8..00000000 --- a/tests/eval/test_utils_eval.py +++ /dev/null @@ -1,78 +0,0 @@ -from math import 
isclose - -import pytest -import torch - -from delphi.eval.utils import ( - dict_filter_quantile, - gather_logprobs, - load_validation_dataset, -) - - -def test_gather_logprobs(): - # vocab size = 3 - logprobs = torch.tensor( - [ - # batch 0 - [ - # seq 0 - [0.00, 0.01, 0.02], - # seq 1 - [0.10, 0.11, 0.12], - ], - # batch 1 - [ - # seq 0 - [1.00, 1.01, 1.02], - # seq 1 - [1.10, 1.11, 1.12], - ], - ] - ) - tokens = torch.tensor( - [ - # batch 0 - [0, 2], - # batch 1 - [1, 2], - ] - ) - expected_output = torch.tensor( - [ - # batch 0 - [0.00, 0.12], - # batch 1 - [1.01, 1.12], - ] - ) - result = gather_logprobs(logprobs, tokens) - assert torch.allclose(result, expected_output) - - -def test_load_validation_dataset(): - text = load_validation_dataset("tinystories-v2-clean") - tokenized = load_validation_dataset("tinystories-v2-clean-tokenized-v0") - - -@pytest.mark.filterwarnings( - "ignore::RuntimeWarning" -) # ignore warnings from numpy empty slice -def test_dict_filter_quantile(): - d = {1: 0.1, 2: 0.2, 3: 0.3, 4: 0.4, 5: 0.5} - result = dict_filter_quantile(d, 0.2, 0.6) - expected = {2: 0.2, 3: 0.3, 4: 0.4} - for k in result: - assert isclose(result[k], expected[k], rel_tol=1e-6) - - # test invalid quantile range - with pytest.raises(ValueError): - dict_filter_quantile(d, 0.6, 0.2) - with pytest.raises(ValueError): - dict_filter_quantile(d, 0.1, 1.1) - with pytest.raises(ValueError): - dict_filter_quantile(d, -0.1, 0.6) - - # test empty dict, will raise a warning - result = dict_filter_quantile({}, 0.2, 0.6) - assert result == {} diff --git a/tests/test_dummy.py b/tests/test_dummy.py deleted file mode 100644 index 88261b12..00000000 --- a/tests/test_dummy.py +++ /dev/null @@ -1,15 +0,0 @@ -import pytest -import torch -from beartype.roar import BeartypeCallHintViolation - -from delphi.dummy import dummy - - -def test_dummy(): - tensor1 = torch.tensor([1.0, 2.0, 3.0]) - tensor2 = torch.tensor([[1, 2, 3], [4, 5, 6]]) - assert torch.allclose(dummy(tensor1), 
torch.tensor([2.0, 3.0, 4.0])) - assert torch.allclose(dummy(tensor2), torch.tensor([0.9, 1.9, 2.9])) - tensor3 = torch.tensor([1, 2, 3]) - with pytest.raises(BeartypeCallHintViolation): - dummy(tensor3) diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 00000000..cdf88413 --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,91 @@ +from math import isclose +from typing import cast + +import pytest +import torch +from datasets import Dataset + +from delphi.eval import dict_filter_quantile, get_all_tok_metrics_in_label + + +@pytest.mark.filterwarnings( + "ignore::RuntimeWarning" +) # ignore warnings from numpy empty slice +def test_dict_filter_quantile(): + d = {1: 0.1, 2: 0.2, 3: 0.3, 4: 0.4, 5: 0.5} + result = dict_filter_quantile(d, 0.2, 0.6) + expected = {2: 0.2, 3: 0.3} + + # compare keys + assert result.keys() == expected.keys() + # compare values + for k in result: + assert isclose(result[k], expected[k], rel_tol=1e-6) + + # test with negative values + d = {1: -0.1, 2: -0.2, 3: -0.3, 4: -0.4, 5: -0.5} + result = dict_filter_quantile(d, 0.2, 0.6) + expected = {3: -0.3, 4: -0.4} + + # compare keys + assert result.keys() == expected.keys() + # compare values + for k in result: + assert isclose(result[k], expected[k], rel_tol=1e-6) + + # test invalid quantile range + with pytest.raises(ValueError): + dict_filter_quantile(d, 0.6, 0.2) + with pytest.raises(ValueError): + dict_filter_quantile(d, 0.1, 1.1) + with pytest.raises(ValueError): + dict_filter_quantile(d, -0.1, 0.6) + + # test empty dict, will raise a warning + result = dict_filter_quantile({}, 0.2, 0.6) + assert result == {} + + +def test_get_all_tok_metrics_in_label(): + token_ids = Dataset.from_dict( + {"tokens": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]} + ).with_format("torch") + selected_tokens = [2, 4, 6, 8] + metrics = torch.tensor([[-1, 0.45, -0.33], [-1.31, 2.3, 0.6], [0.2, 0.8, 0.1]]) + result = get_all_tok_metrics_in_label( + token_ids["tokens"], # type: ignore + 
selected_tokens, + metrics, + ) + # key: (prompt_pos, tok_pos), value: logprob + expected = { + (0, 1): 0.45, + (1, 0): -1.31, + (1, 2): 0.6, + (2, 1): 0.8, + } + + # compare keys + assert result.keys() == expected.keys() + # compare values + for k in result: + assert isclose(cast(float, result[k]), expected[k], rel_tol=1e-6) # type: ignore + + # test with quantile filtering + result_q = get_all_tok_metrics_in_label( + token_ids["tokens"], # type: ignore + selected_tokens, + metrics, + q_start=0.6, + q_end=1.0, + ) + expected_q = { + (1, 2): 0.6, + (2, 1): 0.8, + } + + # compare keys + assert result_q.keys() == expected_q.keys() + # compare values + for k in result_q: + assert isclose(cast(float, result_q[k]), expected_q[k], rel_tol=1e-6) # type: ignore diff --git a/tests/dataset/test_tokeniation.py b/tests/test_tokeniation.py similarity index 97% rename from tests/dataset/test_tokeniation.py rename to tests/test_tokeniation.py index bb4180ba..cc9494b2 100644 --- a/tests/dataset/test_tokeniation.py +++ b/tests/test_tokeniation.py @@ -5,7 +5,7 @@ from datasets import Dataset from transformers import AutoTokenizer -from delphi.dataset.tokenization import extend_deque, make_new_sample, tokenize_dataset +from delphi.tokenization import extend_deque, make_new_sample, tokenize_dataset @pytest.fixture diff --git a/tests/test_utils.py b/tests/test_utils.py index 597438ca..79b639ad 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,13 @@ -from delphi.utils import hf_split_to_split_name +import random +import string -from .utils import random_string +import torch + +from delphi.utils import gather_logprobs, hf_split_to_split_name + + +def random_string(length: int) -> str: + return "".join(random.choices(string.ascii_lowercase, k=length)) def test_hf_split_to_split_name(): @@ -12,3 +19,43 @@ def test_hf_split_to_split_name(): assert hf_split_to_split_name(f"{random_split_name}[:200]") == random_split_name assert 
hf_split_to_split_name(f"{random_split_name}[200:]") == random_split_name assert hf_split_to_split_name(f"{random_split_name}[200:400]") == random_split_name + + +def test_gather_logprobs(): + # vocab size = 3 + logprobs = torch.tensor( + [ + # batch 0 + [ + # seq 0 + [0.00, 0.01, 0.02], + # seq 1 + [0.10, 0.11, 0.12], + ], + # batch 1 + [ + # seq 0 + [1.00, 1.01, 1.02], + # seq 1 + [1.10, 1.11, 1.12], + ], + ] + ) + tokens = torch.tensor( + [ + # batch 0 + [0, 2], + # batch 1 + [1, 2], + ] + ) + expected_output = torch.tensor( + [ + # batch 0 + [0.00, 0.12], + # batch 1 + [1.01, 1.12], + ] + ) + result = gather_logprobs(logprobs, tokens) + assert torch.allclose(result, expected_output) diff --git a/tests/train/config/test_config_utils.py b/tests/train/config/test_config_utils.py index 710aa404..b67791c6 100644 --- a/tests/train/config/test_config_utils.py +++ b/tests/train/config/test_config_utils.py @@ -1,8 +1,6 @@ from typing import Optional -import pytest - -from delphi.constants import TEST_CONFIGS_DIR +from delphi import TEST_CONFIGS_DIR from delphi.train.config.utils import ( _unoptionalize, build_config_from_files_and_overrides, @@ -48,7 +46,7 @@ def test_build_config_from_files_and_overrides(): assert config.eval_iters == 5 # check base values assert config.max_epochs == 2 - assert config.dataset.name == "delphi-suite/v0-tinystories-v2-clean-tokenized" + assert config.dataset.path == "delphi-suite/stories-tokenized" def test_unoptionalize(): diff --git a/tests/train/test_train_step.py b/tests/train/test_train_step.py index 1a7db8cb..e06fa1af 100644 --- a/tests/train/test_train_step.py +++ b/tests/train/test_train_step.py @@ -7,8 +7,7 @@ from jaxtyping import Float from transformers import PreTrainedModel -from delphi.constants import TEST_CONFIGS_DIR -from delphi.eval.utils import get_all_and_next_logprobs +from delphi import TEST_CONFIGS_DIR from delphi.train.config import TrainingConfig from delphi.train.config.utils import 
build_config_from_files_and_overrides from delphi.train.train_step import accumulate_gradients, train_step @@ -18,6 +17,7 @@ init_model, setup_determinism, ) +from delphi.utils import get_all_and_next_logprobs def load_test_config(preset_name: str) -> TrainingConfig: diff --git a/tests/train/test_wandb_utils.py b/tests/train/test_wandb_utils.py index 4ca89670..70179304 100644 --- a/tests/train/test_wandb_utils.py +++ b/tests/train/test_wandb_utils.py @@ -7,11 +7,11 @@ import transformers from dacite import from_dict -from delphi.constants import TEST_CONFIGS_DIR +from delphi import TEST_CONFIGS_DIR from delphi.train.config import TrainingConfig from delphi.train.config.utils import build_config_from_files_and_overrides from delphi.train.utils import ModelTrainingState, initialize_model_training_state -from delphi.train.wandb_utils import init_wandb, log_to_wandb, silence_wandb +from delphi.train.wandb_utils import init_wandb, log_to_wandb @pytest.fixture @@ -19,10 +19,7 @@ def mock_training_config() -> TrainingConfig: preset_path = TEST_CONFIGS_DIR / "debug.json" overrides = { "run_name": "test_run", - "wandb": { - "entity": "test_entity", - "project": "test_project", - }, + "wandb": "test_entity/test_project", } return build_config_from_files_and_overrides([preset_path], overrides) @@ -39,12 +36,6 @@ def mock_model_training_state(mock_training_config): return mts -@patch.dict("os.environ", {}, clear=True) -def test_silence_wandb(): - silence_wandb() - assert os.environ["WANDB_SILENT"] == "true" - - @patch("wandb.init") def test_init_wandb(mock_wandb_init: MagicMock, mock_training_config): init_wandb(mock_training_config) diff --git a/tests/utils.py b/tests/utils.py deleted file mode 100644 index ed81b58a..00000000 --- a/tests/utils.py +++ /dev/null @@ -1,6 +0,0 @@ -import random -import string - - -def random_string(length: int) -> str: - return "".join(random.choices(string.ascii_lowercase, k=length))