Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add token labelling #21

Merged
merged 29 commits into from
Feb 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c8e516b
add token labelling
joshuawe Feb 1, 2024
f57210c
add explanation function
joshuawe Feb 2, 2024
32d498a
add notebook
joshuawe Feb 2, 2024
e9f5c11
test
joshuawe Feb 8, 2024
c10d5c1
switch off dependency labels + add spacy to requirements
joshuawe Feb 8, 2024
ab3be19
small improvements
joshuawe Feb 8, 2024
3c08947
improve notebook explanation
joshuawe Feb 8, 2024
6bf1c56
fix errors
joshuawe Feb 9, 2024
70c337d
add notebook
joshuawe Feb 2, 2024
1fcdd35
test
joshuawe Feb 8, 2024
bd3be77
switch off dependency labels + add spacy to requirements
joshuawe Feb 8, 2024
e3013db
small improvements
joshuawe Feb 8, 2024
90f2dbb
improve notebook explanation
joshuawe Feb 8, 2024
57689f4
fix errors
joshuawe Feb 9, 2024
197364d
complete UPOS tags for token labels
joshuawe Feb 12, 2024
cdef0d6
add tests
joshuawe Feb 13, 2024
2c49e2e
update requirements for delphi tokenizer
joshuawe Feb 13, 2024
535a0c0
added token label script
joshuawe Feb 13, 2024
48f7f6a
add the files containing token information/labels
joshuawe Feb 13, 2024
210a3da
small enhancements suggested for PR
joshuawe Feb 14, 2024
6a4a42d
rebasing
joshuawe Feb 14, 2024
fcf4ba6
improve optional downloading of spacy language model
joshuawe Feb 15, 2024
f234ec5
bugfix: handle tokens empty string ''
joshuawe Feb 15, 2024
4047be4
add argparse for label_all_tokens.py script
joshuawe Feb 15, 2024
3c4a1a4
add tokenized dicts
joshuawe Feb 15, 2024
87e18b3
update notebook
joshuawe Feb 15, 2024
ef0f2e4
undo __init__
joshuawe Feb 15, 2024
5af8a6f
change spacy model from "trf" to "sm"
joshuawe Feb 16, 2024
c292da7
bug fix
joshuawe Feb 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
435 changes: 435 additions & 0 deletions notebooks/token_labelling.ipynb

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,8 @@ black==23.12.1
jaxtyping==0.2.25
beartype==0.16.4
pre-commit==3.6.0
isort==5.13.2
isort==5.13.2
spacy==3.7.2
chardet==5.2.0
sentencepiece==0.1.99
protobuf==4.25.2
110 changes: 110 additions & 0 deletions scripts/label_all_tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import argparse
import pickle
from pathlib import Path

import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from delphi.eval import token_labelling


def tokenize(
    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, sample_txt: str
) -> torch.Tensor:
    """Encode ``sample_txt`` with the tokenizer's BOS token text prepended.

    The BOS token is prepended as text *before* encoding; supposedly this
    can give a different result than prepending the BOS token id afterwards.

    Args:
        tokenizer: Tokenizer providing ``bos_token`` and ``encode``.
        sample_txt: The text to encode.

    Returns:
        Element ``[0]`` of the batched result that ``encode`` produces with
        ``return_tensors="pt"``, i.e. the token ids of the single input.
        (Previously mis-annotated as ``int``.)
    """
    text_with_bos = tokenizer.bos_token + sample_txt
    # return_tensors="pt" yields a batch; keep only its first (and only) row.
    encoded = tokenizer.encode(text_with_bos, return_tensors="pt")
    return encoded[0]


# Decode a sentence
def decode(
    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, token_ids: int | list[int]
) -> str:
    """Turn a single token id, or a list of token ids, back into text.

    Decoding is done with ``skip_special_tokens=True``.
    """
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    return decoded_text


def main():
    """Dump the tokenizer vocabulary and label every token in it.

    Steps:
      1. Load the tokenizer named by ``--model_name`` from Huggingface and
         write ``<id>,<decoded token>`` lines to ``all_tokens_list.txt``.
      2. Label each decoded token with
         ``token_labelling.label_batch_sentences`` and pickle the resulting
         ``{token_id: labels}`` dict to ``labelled_token_ids_dict.pkl``.

    Both files are written to ``src/delphi/eval/`` relative to the current
    working directory.

    Raises:
        RuntimeError: If the pickled dict does not round-trip unchanged.
    """
    # Setup argparse
    parser = argparse.ArgumentParser(description="Tokenization and labeling utility.")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the model to use for tokenization and labeling.",
        default="delphi-suite/delphi-llama2-100k",
        required=False,
    )
    args = parser.parse_args()

    # Access command-line arguments
    # Directory to save the results
    SAVE_DIR = Path("src/delphi/eval/")
    model_name = args.model_name

    print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n")
    print(f"You chose the model: {model_name}\n")
    print(
        f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{SAVE_DIR}'\n"
    )

    # ================ (1) =================
    print("(1) Create a list of all tokens in the tokenizer's vocabulary ...")

    # Load the tokenizer from Huggingface
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    vocab_size = tokenizer.vocab_size
    print("Loaded the tokenizer.\nThe vocab size is:", vocab_size)

    # One "<id>,<token>" line per vocabulary entry. Collected in a list
    # instead of growing a single string with += inside the loop.
    token_lines = [f"{i},{decode(tokenizer, i)}\n" for i in range(vocab_size)]

    # Save the list of all tokens to a file
    filepath = SAVE_DIR / "all_tokens_list.txt"
    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(token_lines)

    print(f"Saved the list of all tokens to:\n\t{filepath}\n")

    # ================ (2) =================
    print("(2) Label each token ...")

    # let's label each token
    labelled_token_ids_dict: dict[int, dict[str, bool]] = {}  # token_id: labels
    # we iterate over all token_ids individually
    for token_id in tqdm(range(vocab_size), desc="Labelling tokens"):
        token_text = decode(tokenizer, token_id)
        # With tokenized=True every sentence must be a *list* of token
        # strings. Passing the bare string makes spaCy's Doc iterate over its
        # characters, so only the token's first character would be labelled.
        # An empty token becomes an empty sentence, which the labelling code
        # already maps to the default "Is Other" labels.
        sentence = [token_text] if token_text else []
        labels = token_labelling.label_batch_sentences(
            [sentence], tokenized=True, verbose=False
        )
        # first (only) sentence, first (only) token
        labelled_token_ids_dict[token_id] = labels[0][0]

    # Save the labelled tokens to a file
    filepath = SAVE_DIR / "labelled_token_ids_dict.pkl"
    with open(filepath, "wb") as f:
        pickle.dump(labelled_token_ids_dict, f)

    print(f"Saved the labelled tokens to:\n\t{filepath}\n")

    # sanity check that the pickled and the original dict are the same
    print("Sanity check ...", end="")
    with open(filepath, "rb") as f:
        pickled = pickle.load(f)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if labelled_token_ids_dict != pickled:
        raise RuntimeError(
            f"Pickled labels in '{filepath}' differ from the in-memory labels."
        )
    print(" completed.")

    print(" END ".center(50, "="))


# Run the script's main() only when executed directly, not when imported.
if __name__ == "__main__":
    main()
Binary file added src/delphi/eval/all_tokens_list.txt
Binary file not shown.
Binary file added src/delphi/eval/labelled_token_ids_dict.pkl
Binary file not shown.
210 changes: 210 additions & 0 deletions src/delphi/eval/token_labelling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
from typing import Callable, Optional

import spacy
from spacy.tokens import Doc, Token
from spacy.util import is_package

# Make sure the English language model is installed, by the equivalent of:
#   python -m spacy download en_core_web_sm
# The check runs whenever this module is imported; the download only starts
# if the model package is not already installed.
SPACY_MODEL = "en_core_web_sm"  # small: "en_core_web_sm", large: "en_core_web_trf"
NLP = None  # global var to hold the language model
if not is_package(SPACY_MODEL):
    # NOTE(review): the two positional False values are presumably spaCy's
    # `direct` and `sdist` flags -- confirm against the installed spaCy version.
    spacy.cli.download(SPACY_MODEL, False, False)


# Maps each label name to a predicate that takes a spaCy token and returns a
# bool. The label names are used as keys of the per-token label dicts, so
# they must stay stable (including their historical spellings).
TOKEN_LABELS: dict[str, Callable] = {
    # --- custom categories ---
    "Starts with space": (lambda token: token.text.startswith(" ")),  # bool
    # Slice instead of index so that an empty token text yields False rather
    # than raising IndexError.
    "Capitalized": (lambda token: token.text[:1].isupper()),  # bool
    # --- POS (part-of-speech) categories ---
    # They include the Universal POS tags (https://universaldependencies.org/u/pos/)
    # -> "POS Tag": (lambda token: token.pos_),  # 'NOUN', 'VB', ..
    "Is Adjective": (lambda token: token.pos_ == "ADJ"),
    "Is Adposition": (lambda token: token.pos_ == "ADP"),
    "Is Adverb": (lambda token: token.pos_ == "ADV"),
    "Is Auxiliary": (lambda token: token.pos_ == "AUX"),
    "Is Coordinating conjuction": (lambda token: token.pos_ == "CCONJ"),
    "Is Determiner": (lambda token: token.pos_ == "DET"),
    "Is Interjunction": (lambda token: token.pos_ == "INTJ"),
    "Is Noun": (lambda token: token.pos_ == "NOUN"),
    "Is Numeral": (lambda token: token.pos_ == "NUM"),
    "Is Particle": (lambda token: token.pos_ == "PART"),
    "Is Pronoun": (lambda token: token.pos_ == "PRON"),
    "Is Proper Noun": (lambda token: token.pos_ == "PROPN"),
    "Is Punctuation": (lambda token: token.pos_ == "PUNCT"),
    "Is Subordinating conjuction": (lambda token: token.pos_ == "SCONJ"),
    "Is Symbol": (lambda token: token.pos_ == "SYM"),
    "Is Verb": (lambda token: token.pos_ == "VERB"),
    "Is Other": (lambda token: token.pos_ == "X"),
    # --- dependency categories (currently switched off) ---
    # -> "Dependency": (lambda token: token.dep_),  # 'nsubj', 'ROOT', 'dobj', ..
    # "Is Subject": (lambda token: token.dep_ == "nsubj"),
    # "Is Object": (lambda token: token.dep_ == "dobj"),
    # "Is Root": (
    #     lambda token: token.dep_ == "ROOT"
    # ),  # root of the sentence (often a verb)
    # "Is auxiliary": (lambda token: token.dep_ == "aux"),
    # --- Named entity recognition (NER) categories ---
    # "Named Entity Type": (lambda token: token.ent_type_),  # '', 'PERSON', 'ORG', 'GPE', ..
    "Is Named Entity": (lambda token: token.ent_type_ != ""),
}


def explain_token_labels(token: Optional[Token] = None) -> None:
    """
    Print an explanation of one token's labels, or of every label in the
    spaCy glossary (POS, dependency, NER, ...) when no token is given.

    Parameters
    ----------
    token : Optional[Token], optional
        The token whose labels should be explained. If None, all possible
        labels are explained, by default None.
    """
    # No token given: dump the complete spaCy glossary and stop.
    if token is None:
        all_labels = spacy.glossary.GLOSSARY
        print(
            f"Explanation of all {len(all_labels)} token labels (POS, dependency, NER, ...):"
        )
        for abbreviation, meaning in all_labels.items():
            print(" ", abbreviation.ljust(10), meaning)
        return

    # Label the token first, then print its text, dependency and POS
    # explanations followed by one numbered line per label.
    token_labels = label_single_token(token)
    print(" Explanation of token labels ".center(45, "-"))
    print("Token text:".ljust(20), token.text)
    print("Token dependency:".ljust(20), spacy.glossary.explain(token.dep_))
    print("Token POS:".ljust(20), spacy.glossary.explain(token.pos_))
    print(" Token labels ".center(45, "-"))
    for index, (name, flag) in enumerate(token_labels.items()):
        print(f" {index:2} ", name.ljust(20), flag)


def label_single_token(token: Token | None) -> dict[str, bool]:
    """
    Compute all labels for one token that has been analyzed by spaCy.

    Parameters
    ----------
    token : Token | None
        The token to be labelled. None stands for an empty-string token
        (or similar) for which there is nothing to analyze.

    Returns
    -------
    dict[str, bool]
        A dictionary with one entry per name in `TOKEN_LABELS`, mapping the
        label name to its boolean value for this token.
    """
    if token is None:
        # Nothing to analyze: every label is False except "Is Other".
        fallback = dict.fromkeys(TOKEN_LABELS, False)
        fallback["Is Other"] = True
        return fallback

    # Normal token: evaluate every label predicate on it.
    return {name: check(token) for name, check in TOKEN_LABELS.items()}


def label_sentence(tokens: Doc | list[Token]) -> list[dict[str, bool]]:
    """
    Label every spaCy token of a sentence. The token's context is taken into
    account for dependency labels (e.g. subject, object, ...), IF dependency
    labels are turned on.

    Parameters
    ----------
    tokens : Doc | list[Token]
        The tokens of one sentence.

    Returns
    -------
    list[dict[str, bool]]
        One label dictionary per token. An empty input yields a single
        entry holding the labels of a `None` token.
    """
    # An empty sentence comes from an '' empty-string token or similar;
    # it is still represented by exactly one label dict.
    if len(tokens) == 0:
        return [label_single_token(None)]

    return [label_single_token(single_token) for single_token in tokens]


def label_batch_sentences(
    sentences: list[str] | list[list[str]],
    tokenized: bool = True,
    verbose: bool = False,
) -> list[list[dict[str, bool]]]:
    """
    Label the tokens of a whole batch of sentences. The token's context is
    taken into account for dependency labels (e.g. subject, object, ...).

    Parameters
    ----------
    sentences : list
        A batch/list of sentences, each being a list of tokens.
    tokenized : bool, optional
        Whether the sentences are already tokenized, by default True. If
        True, `sentences` must be list[list[str]]. Set to False when the
        sentences are full strings rather than lists of tokens.
    verbose : bool, optional
        Whether to print the tokens and their labels to the console, by
        default False.

    Returns
    -------
    list[list[dict[str, bool]]]
        One list per sentence, holding one label dictionary per token.
        Sentence -> Token -> Labels
    """
    global NLP

    # Load the English language model on first use and keep it in the
    # module-level NLP variable for later calls.
    if NLP is None:
        NLP = spacy.load(SPACY_MODEL)

    print_labels = verbose is True
    # list holding sentences holding tokens holding corresponding token labels
    batch_labels: list[list[dict[str, bool]]] = []

    for sentence in sentences:
        if not tokenized:
            # sentence is a single string: run the full pipeline on it
            doc = NLP(sentence)  # type: ignore
        else:
            # sentence is a list of tokens: build the Doc from them and
            # apply every pipeline component except a tokenizer
            doc = Doc(NLP.vocab, words=sentence)  # type: ignore
            for component_name, component in NLP.pipeline:
                if component_name == "tokenizer":
                    continue
                doc = component(doc)

        sentence_labels = label_sentence(doc)

        # print each token followed by the label names and its label values
        if print_labels:
            for token, token_labels in zip(doc, sentence_labels):
                print(f"Token: {token}")
                print(" | ".join(TOKEN_LABELS))
                # pad every value to the width of its label name
                row = " | ".join(
                    str(value).ljust(len(name)) for name, value in token_labels.items()
                )
                print(row)
                print("---")

        batch_labels.append(sentence_labels)

        if print_labels:
            print("\n")

    return batch_labels
Loading
Loading