This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Tokenizer docstrings #131

Merged: 8 commits, Nov 2, 2023
61 changes: 60 additions & 1 deletion src/canopy/tokenizer/openai.py
@@ -5,30 +5,89 @@


class OpenAITokenizer(BaseTokenizer):
"""
Tokenizer for OpenAI models, based on the tiktoken library.

Usage:
Initialize the singleton tokenizer with the OpenAITokenizer class:
>>> from canopy.tokenizer import Tokenizer
>>> Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo")

You can then use the tokenizer instance from anywhere in the code:
>>> from canopy.tokenizer import Tokenizer
>>> tokenizer = Tokenizer()
>>> tokenizer.tokenize("Hello world!")
['Hello', ' world', '!']
""" # noqa: E501

MESSAGE_TOKENS_OVERHEAD = 3
FIXED_PREFIX_TOKENS = 3

def __init__(self, model_name: str = "gpt-3.5-turbo"):
"""
Initialize the tokenizer.

Args:
model_name: The name of the model to use. Defaults to "gpt-3.5-turbo".
You can find the list of available models here: https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19C1-L19C18
As you can see, both gpt-3.5 and gpt-4 use the same cl100k_base encoding.
""" # noqa: E501
self._encoder = tiktoken.encoding_for_model(model_name)
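
The shared-encoding claim in the docstring above is easy to verify directly with tiktoken:

import tiktoken

# Both model families resolve to the same underlying encoding:
print(tiktoken.encoding_for_model("gpt-3.5-turbo").name)  # cl100k_base
print(tiktoken.encoding_for_model("gpt-4").name)          # cl100k_base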

def tokenize(self, text: str) -> List[str]:
"""
Tokenize a text using tiktoken.

Args:
text: The text to tokenize.

Returns:
The list of tokens.
"""
return [self._encoder.decode([encoded_token])
for encoded_token in self._encode(text)]

def detokenize(self, tokens: List[str]) -> str:
"""
Detokenize a list of tokens that were previously tokenized using this tokenizer.

Args:
tokens: The list of tokens to detokenize.

Returns:
The detokenized text as a string.
"""
if not isinstance(tokens, List):
raise TypeError(f"detokenize expect List[str], got f{type(tokens)}")
return "".join(tokens)

def token_count(self, text: str) -> int:
"""
Count the number of tokens in a text.

Args:
text: The text to count the tokens of.

Returns:
The number of tokens in the text.
"""
return len(self._encode(text))

def _encode(self, text):
return self._encoder.encode(text, disallowed_special=())
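
Passing disallowed_special=() tells tiktoken to treat special-token text such as "<|endoftext|>" as ordinary input instead of raising an error; a small illustration:

import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
# Default behavior: special-token text in the input raises a ValueError.
# enc.encode("<|endoftext|>")  # ValueError
# With disallowed_special=(), the same text is tokenized as plain characters:
ids = enc.encode("<|endoftext|>", disallowed_special=())
print(len(ids))  # several ordinary tokens, not the single special-token id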

def messages_token_count(self, messages: Messages) -> int:
"""
Count the number of tokens in a list of messages, as they are counted by OpenAI models.
Accounts for the overhead of the message structure.
Adapted from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb

Args:
messages: The list of messages to count the tokens of.

Returns:
The number of tokens in the messages, as expected to be counted by OpenAI models.
""" # noqa: E501
num_tokens = 0
for message in messages:
num_tokens += self.MESSAGE_TOKENS_OVERHEAD
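
The remainder of messages_token_count is collapsed in the diff above. Below is a minimal standalone sketch of the counting scheme the two class constants imply, following the OpenAI cookbook recipe; the function shape and the content-only accounting are assumptions, not the project's verbatim code:

from typing import Callable, List

MESSAGE_TOKENS_OVERHEAD = 3  # structural tokens OpenAI wraps around each message (assumed meaning)
FIXED_PREFIX_TOKENS = 3      # tokens that prime the assistant's reply (assumed meaning)

def messages_token_count(contents: List[str],
                         count_tokens: Callable[[str], int]) -> int:
    # Each message costs its text tokens plus a fixed structural overhead;
    # the prompt as a whole then pays a fixed prefix.
    total = FIXED_PREFIX_TOKENS
    for text in contents:
        total += MESSAGE_TOKENS_OVERHEAD + count_tokens(text)
    return total

Any str-to-int counter works for count_tokens, for example OpenAITokenizer().token_count.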
116 changes: 113 additions & 3 deletions src/canopy/tokenizer/tokenizer.py
@@ -1,10 +1,30 @@
- from typing import List, Optional
+ from typing import List, Optional, Type

from .openai import OpenAITokenizer
from .base import BaseTokenizer
from ..models.data_models import Messages


class Tokenizer:

"""
Singleton class for tokenization.
The singleton behavior unifies tokenization across the system.

Usage:

To initialize the tokenizer, call Tokenizer.initialize(tokenizer_class, **kwargs)
>>> from canopy.tokenizer import Tokenizer
>>> Tokenizer.initialize()

Then, you can get the tokenizer instance by calling Tokenizer() from anywhere in the code and use it:
>>> tokenizer = Tokenizer()
>>> tokenizer.tokenize("Hello world!")
['Hello', ' world', '!']
>>> tokenizer.detokenize(['Hello', ' world', '!'])
'Hello world!'
""" # noqa: E501

_instance = None
_tokenizer_instance: Optional[BaseTokenizer] = None
_initialized = False
Expand All @@ -20,7 +40,40 @@ def __new__(cls):
return cls._instance
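
The body of __new__ is collapsed above except for its return. A standard lazy-singleton shape consistent with the class attributes, offered as a sketch rather than the project's verbatim code:

class Tokenizer:
    _instance = None
    _initialized = False

    def __new__(cls):
        if cls._instance is None:
            if not cls._initialized:
                # Assumed guard: force callers through Tokenizer.initialize()
                raise ValueError("Tokenizer must be initialized before use")
            cls._instance = super().__new__(cls)
        return cls._instance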

@classmethod
- def initialize(cls, tokenizer_class=DEFAULT_TOKENIZER_CLASS, **kwargs):
+ def initialize(cls,
+                tokenizer_class: Type[BaseTokenizer] = DEFAULT_TOKENIZER_CLASS,
+                **kwargs):
"""
Initialize the tokenizer singleton.

Args:
tokenizer_class: The tokenizer class to use. Must be a subclass of BaseTokenizer. Defaults to OpenAITokenizer.
**kwargs: Keyword arguments to pass to the tokenizer class constructor.

Examples:
Initialize the tokenizer with the default tokenizer class:

>>> from canopy.tokenizer import Tokenizer
>>> Tokenizer.initialize()

Initialize the tokenizer with a custom tokenizer class:

>>> from canopy.tokenizer import Tokenizer
>>> from canopy.tokenizer.base import BaseTokenizer
>>> class MyTokenizer(BaseTokenizer):
...     def tokenize(self, text: str) -> List[str]:
...         return text.split()
...     def detokenize(self, tokens: List[str]) -> str:
...         return " ".join(tokens)
...     def messages_token_count(self, messages) -> int:
...         return sum([self.token_count(message.content) + 3 for message in messages])
>>> Tokenizer.initialize(MyTokenizer)

Then, you can instantiate a tokenizer instance by calling Tokenizer() from anywhere in the code:

>>> from canopy.tokenizer import Tokenizer
>>> tokenizer = Tokenizer()
""" # noqa: E501
if not issubclass(tokenizer_class, BaseTokenizer):
raise ValueError("Invalid tokenizer class provided")
if issubclass(tokenizer_class, Tokenizer):
Expand All @@ -30,26 +83,83 @@ def initialize(cls, tokenizer_class=DEFAULT_TOKENIZER_CLASS, **kwargs):

@classmethod
def clear(cls):
"""
Clear the tokenizer singleton.
"""
cls._instance = None
cls._tokenizer_instance = None
cls._initialized = False

@classmethod
def initialize_from_config(cls, config: dict):
"""
Initialize the tokenizer singleton from a config dictionary.
Used by the config module to initialize the tokenizer from a config file.

Args:
config: A dictionary containing the tokenizer configuration. If the "type" key is not provided, OpenAITokenizer will be used.

Usage:
>>> from canopy.tokenizer import Tokenizer
>>> config = {
... "type": "OpenAITokenizer",
... "model_name": "gpt2"
... }
>>> Tokenizer.initialize_from_config(config)
""" # noqa: E501
if cls._initialized:
raise ValueError("Tokenizer has already been initialized")
config["type"] = config.get("type", cls.DEFAULT_TOKENIZER_CLASS.__name__)
cls._tokenizer_instance = BaseTokenizer.from_config(config)
cls._initialized = True

def tokenize(self, text: str) -> List[str]:
"""
Splits a text into tokens.

Args:
text: The text to tokenize as a string.

Returns:
A list of tokens.
"""
return self._tokenizer_instance.tokenize(text) # type: ignore[union-attr]

def detokenize(self, tokens: List[str]) -> str:
"""
Joins a list of tokens into a text.

Args:
tokens: The tokens to join, as a list of strings. Typically, these were produced by tokenize().

Returns:
The joined text as a string.
""" # noqa: E501
return self._tokenizer_instance.detokenize(tokens) # type: ignore[union-attr]

def token_count(self, text: str) -> int:
"""
Counts the number of tokens in a text.

Args:
text: The text to count as a string.

Returns:
The number of tokens in the text.
"""
return self._tokenizer_instance.token_count(text) # type: ignore[union-attr]

- def messages_token_count(self, messages) -> int:
+ def messages_token_count(self, messages: Messages) -> int:
"""
Counts the number of tokens in a Messages object.
Behind the scenes, each LLM provider may add a different overhead for each message in the prompt,
which is not necessarily the same as the number of tokens in the message text.
This method accounts for that overhead and returns the total number of tokens in the prompt, as counted by the LLM provider.

Args:
messages: The Messages object to count.

Returns:
The number of tokens in the Messages object.
""" # noqa: E501
return self._tokenizer_instance.messages_token_count(messages) # type: ignore[union-attr] # noqa: E501
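
For reference, end-to-end usage of the counting API might look like the following; UserMessage is assumed here to be the chat-message model exported by canopy.models.data_models:

from canopy.tokenizer import Tokenizer
from canopy.models.data_models import UserMessage  # assumed model name

Tokenizer.initialize()
tokenizer = Tokenizer()

messages = [
    UserMessage(content="Hello!"),
    UserMessage(content="How are you?"),
]
# Content tokens plus the per-message and fixed-prefix overheads:
print(tokenizer.messages_token_count(messages))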