tokenizer_template_switch.py

import re
from transformers import AutoTokenizer

def extract_separators(template):
    """
    Extracts separators used in the tokenization template.
    """
    # Adjust the regex to correctly match the specific pattern between '{{' and '+ message["content"] +'
    pattern = r"\{\{\s*([^{}]+?)\s*\+ message\['content'\]"
    matches = re.findall(pattern, template)
    # Clean up any extra spaces and return the matches
    separators = [match.strip() for match in matches]

    if any("message['role']" in element for element in separators):
        roles = ["system", "user", "assistant"]
        separators_ = []
        for role in roles:
            separators_.append(separators[0].replace(" + message['role'] + ", role).replace("'",""))
        return separators_

    return separators

def detect_eos_token(jinja_template, tokenizer):
    if "<|im_end|>" in jinja_template:
        return "<|im_end|>"
    if "</s>" in jinja_template:
        return "</s>"
    if "eos_token" in jinja_template:
        return tokenizer.eos_token    
    else:
        return "<|endoftext|>"

def recover_messages(formatted_message, separators, eos_token):
    """
    Recovers the original messages from the formatted message string.
    """
    # Split the formatted message using the end-of-string token
    split_messages = formatted_message.split(eos_token)
    
    # Remove the last empty string if it exists due to a trailing separator
    if split_messages and split_messages[-1].strip() == '':
        split_messages.pop()

    # Prepare the list to hold the recovered messages
    recovered_messages = []

    # Define roles after the first message, alternating between "user" and "assistant"
    alternate_roles = ["user", "assistant"]
    
    # Iterate over the split messages
    for index, message_content in enumerate(split_messages):
        # Determine the role, starting with "system" for the first message
        # then alternating between "user" and "assistant" for subsequent messages
        if index == 0:
            role = "system"
        else:
            role = alternate_roles[(index - 1) % 2]

        # Clean the message content by removing leading/trailing whitespace and separators
        clean_content = message_content.strip()
        for separator in separators:
            clean_content = clean_content.replace(separator.strip("'"), '', 1).strip()

        # Append the cleaned message with its role to the list
        recovered_messages.append({"role": role, "content": clean_content})

    return recovered_messages

def recover_chat_messages(tokenized_chat, tokenizer):
    """
    Given a tokenized_chat string and a tokenizer, returns the list of message dictionaries.
    """
    jinja_template = tokenizer.chat_template
    separators = extract_separators(jinja_template)
    eos_token = eos_token = detect_eos_token(jinja_template, tokenizer)
    recovered_messages = recover_messages(tokenized_chat, separators, eos_token)
    return recovered_messages

# Example usage
if __name__ == "__main__":
    checkpoint = "Qwen/Qwen1.5-0.5B"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    
    messages = [
        {
            "role": "system",
            "content": "You are a friendly chatbot who always responds in the style of a pirate",
        },
        {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
    ]
    tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False)
    print(tokenized_chat)
    
    recovered_messages = recover_chat_messages(tokenized_chat, tokenizer)
    print(recovered_messages)