Skip to content
This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Tokenizer docstrings #131

Merged
merged 8 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 60 additions & 1 deletion src/canopy/tokenizer/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,89 @@


class OpenAITokenizer(BaseTokenizer):
"""
Tokenizer for OpenAI models, based on the tiktoken library.

Usage:
Initialize the singleton tokenizer with the OpenAITokenizer class:
>>> from canopy.tokenizer import Tokenizer
>>> Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo")

You can then use the tokenizer instance from anywhere in the code:
>>> from canopy.tokenizer import Tokenizer
>>> tokenizer = Tokenizer()
>>> tokenizer.tokenize("Hello world!")
['Hello', ' world', '!']
""" # noqa: E501

MESSAGE_TOKENS_OVERHEAD = 3
FIXED_PREFIX_TOKENS = 3

def __init__(self, model_name: str = "gpt-3.5-turbo"):
    """
    Create a tokenizer backed by the tiktoken encoding of an OpenAI model.

    Args:
        model_name: Name of the model whose encoding should be loaded. Defaults to "gpt-3.5-turbo".
                    The list of available models can be found here: https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19C1-L19C18
                    Per that list, both gpt-3.5 and gpt-4 use the same cl100k_base tokenizer.
    """  # noqa: E501
    # Resolve the encoding once; every other method reuses this encoder.
    encoder = tiktoken.encoding_for_model(model_name)
    self._encoder = encoder

def tokenize(self, text: str) -> List[str]:
    """
    Split a text into tokens using tiktoken.

    The text is first encoded into token ids, then every id is decoded on
    its own so the result holds one string per token.

    Args:
        text: The text to tokenize.

    Returns:
        The list of tokens, as strings.
    """
    # NOTE(review): ids are decoded one at a time; characters that span
    # several tokens may not decode cleanly this way - confirm with tiktoken.
    pieces = []
    for token_id in self._encode(text):
        pieces.append(self._encoder.decode([token_id]))
    return pieces

def detokenize(self, tokens: List[str]) -> str:
    """
    Detokenize a list of tokens that were previously tokenized using this tokenizer.

    The tokens are concatenated as-is, with no separator inserted between them.

    Args:
        tokens: The list of tokens to detokenize.

    Returns:
        The detokenized text as a string.

    Raises:
        TypeError: If `tokens` is not a list.
    """
    if not isinstance(tokens, list):
        # The placeholder previously carried a stray literal "f", which
        # rendered as "got f<class ...>" in the message.
        raise TypeError(f"detokenize expect List[str], got {type(tokens)}")
    return "".join(tokens)

def token_count(self, text: str) -> int:
    """
    Return how many tokens a text is encoded into.

    Args:
        text: The text whose tokens should be counted.

    Returns:
        The number of tokens in the text.
    """
    token_ids = self._encode(text)
    return len(token_ids)

def _encode(self, text):
    """
    Encode a text with the underlying encoder.

    An empty `disallowed_special` is always passed to the encoder.

    Args:
        text: The text to encode.

    Returns:
        Whatever the encoder's `encode` call returns for `text`.
    """
    encoder = self._encoder
    return encoder.encode(text, disallowed_special=())

def messages_token_count(self, messages: Messages) -> int:
# Adapted from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb # noqa
"""
Count the number of tokens in a list of messages as expected to be counted by OpenAI models.
Account for the overhead of the messages structure.
Taken from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb

Args:
messages: The list of messages to count the tokens of.

Returns:
The number of tokens in the messages, as expected to be counted by OpenAI models.
""" # noqa: E501
num_tokens = 0
for message in messages:
num_tokens += self.MESSAGE_TOKENS_OVERHEAD
Expand Down
108 changes: 105 additions & 3 deletions src/canopy/tokenizer/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,30 @@
from typing import List, Optional
from typing import List, Optional, Type

from .openai import OpenAITokenizer
from .base import BaseTokenizer
from ..models.data_models import Messages


class Tokenizer:

"""
Singleton class for tokenization.
The singleton behavior unifies tokenization across the system.

Usage:

To initialize the tokenizer, call Tokenizer.initialize(tokenizer_class, **kwargs)
>>> from canopy.tokenizer import Tokenizer
>>> Tokenizer.initialize()

Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code and use it:
acatav marked this conversation as resolved.
Show resolved Hide resolved
>>> tokenizer = Tokenizer()
>>> tokenizer.tokenize("Hello world!")
['Hello', ' world', '!']
>>> tokenizer.detokenize(['Hello', ' world', '!'])
'Hello world!'
""" # noqa: E501

_instance = None
_tokenizer_instance: Optional[BaseTokenizer] = None
_initialized = False
Expand All @@ -20,7 +40,40 @@ def __new__(cls):
return cls._instance

@classmethod
def initialize(cls, tokenizer_class=DEFAULT_TOKENIZER_CLASS, **kwargs):
def initialize(cls,
tokenizer_class: Type[BaseTokenizer] = DEFAULT_TOKENIZER_CLASS,
**kwargs):
"""
Initialize the tokenizer singleton.

Args:
tokenizer_class: The tokenizer class to use. Must be a subclass of BaseTokenizer. Defaults to OpenAITokenizer.
**kwargs: Keyword arguments to pass to the tokenizer class constructor.
acatav marked this conversation as resolved.
Show resolved Hide resolved

Examples:
Initialize the tokenizer with the default tokenizer class:

>>> from canopy.tokenizer import Tokenizer
>>> Tokenizer.initialize()

Initialize the tokenizer with a custom tokenizer class:

>>> from canopy.tokenizer import Tokenizer
>>> from canopy.tokenizer.base import BaseTokenizer
>>> class MyTokenizer(BaseTokenizer):
... def tokenize(self, text: str) -> List[str]:
... return text.split()
... def detokenize(self, tokens: List[str]) -> str:
... return " ".join(tokens)
... def messages_token_count(self, messages) -> int:
... return sum([self.token_count(message) + 3 for message in messages])
>>> Tokenizer.initialize(MyTokenizer)

Then, you can init a tokenizer instance by calling Tokenizer() from anywhere in the code:
acatav marked this conversation as resolved.
Show resolved Hide resolved

>>> from canopy.tokenizer import Tokenizer
>>> tokenizer = Tokenizer()
""" # noqa: E501
if not issubclass(tokenizer_class, BaseTokenizer):
raise ValueError("Invalid tokenizer class provided")
if issubclass(tokenizer_class, Tokenizer):
Expand All @@ -30,26 +83,75 @@ def initialize(cls, tokenizer_class=DEFAULT_TOKENIZER_CLASS, **kwargs):

@classmethod
def clear(cls):
    """
    Reset the tokenizer singleton to its uninitialized state.

    Drops the cached singleton instance and the wrapped tokenizer instance,
    and marks the class as not initialized.
    """
    cls._instance = cls._tokenizer_instance = None
    cls._initialized = False

@classmethod
def initialize_from_config(cls, config: dict):
    """
    Initialize the tokenizer singleton from a config dictionary.
    Used by the config module to initialize the tokenizer from a config file.

    Args:
        config: A dictionary containing the tokenizer configuration. The "type" key holds the tokenizer class name;
                when it is missing, the name of the default tokenizer class is used.
                The dictionary passed in is not modified.

    Raises:
        ValueError: If the tokenizer has already been initialized.
    """  # noqa: E501
    if cls._initialized:
        raise ValueError("Tokenizer has already been initialized")
    # Work on a shallow copy so filling in the default "type" does not leak
    # into the caller's dictionary.
    resolved_config = dict(config)
    resolved_config.setdefault("type", cls.DEFAULT_TOKENIZER_CLASS.__name__)
    cls._tokenizer_instance = BaseTokenizer.from_config(resolved_config)
    cls._initialized = True

def tokenize(self, text: str) -> List[str]:
    """
    Split a text into tokens by delegating to the wrapped tokenizer instance.

    Args:
        text: The text to tokenize as a string.

    Returns:
        A list of tokens.
    """
    backend = self._tokenizer_instance
    return backend.tokenize(text)  # type: ignore[union-attr]

def detokenize(self, tokens: List[str]) -> str:
    """
    Join a list of tokens back into a text by delegating to the wrapped tokenizer instance.

    Args:
        tokens: The tokens to join as a list of strings. Consider using tokenize() first.

    Returns:
        The joined text as a string.
    """  # noqa: E501
    backend = self._tokenizer_instance
    return backend.detokenize(tokens)  # type: ignore[union-attr]

def token_count(self, text: str) -> int:
    """
    Count the tokens in a text by delegating to the wrapped tokenizer instance.

    Args:
        text: The text to count as a string.

    Returns:
        The number of tokens in the text.
    """
    backend = self._tokenizer_instance
    return backend.token_count(text)  # type: ignore[union-attr]

def messages_token_count(self, messages: Messages) -> int:
    """
    Counts the number of tokens in a Messages object.
    Behind the scenes, for each LLM provider there might be a different overhead for each message in the prompt,
    which is not necessarily the same as the number of tokens in the message text.
    This method delegates to the wrapped tokenizer instance, which is expected to account for that overhead
    and return the total number of tokens in the prompt, as counted by the LLM provider.

    Args:
        messages: The Messages object to count.

    Returns:
        The number of tokens in the Messages object.
    """  # noqa: E501
    backend = self._tokenizer_instance
    return backend.messages_token_count(messages)  # type: ignore[union-attr]